Attention
SDP Attention
This is the default built-in attention in PyTorch.
sdp_attention: true
For more details: PyTorch docs
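For illustration, a hypothetical minimal config fragment enabling SDP attention might look like the following; the base_model value is only a placeholder.
# hypothetical config fragment; base_model is a placeholder
base_model: meta-llama/Llama-2-7b-hf
# use PyTorch's built-in scaled dot product attention
sdp_attention: true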
Flash Attention 2
Uses efficient fused kernels to compute exact attention with lower memory usage.
flash_attention: true
For more details: Flash Attention
Nvidia
Requirements: Ampere, Ada, or Hopper GPUs
Note: For Turing GPUs or older, please use one of the other attention methods.
pip install flash-attn --no-build-isolation
If you get an undefined symbol error while training, make sure you installed PyTorch before Axolotl. Alternatively, try reinstalling flash-attn or downgrading to an earlier version.
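As a sketch, flash attention is often paired with sample packing in the config; the sample_packing key is assumed here from Axolotl's general options and is not required.
# hypothetical config fragment
flash_attention: true
# sample packing is commonly enabled together with flash attention
sample_packing: true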
Flash Attention 3
Requirements: Hopper GPUs only, CUDA 12.8 (recommended)
git clone https://github.com/Dao-AILab/flash-attention.git
cd flash-attention/hopper
python setup.py install
AMD
Requirements: ROCm 6.0 and above.
Flex Attention
A flexible PyTorch attention API, used in combination with torch.compile.
flex_attention: true

# recommended
torch_compile: true
We recommend using the latest stable version of PyTorch for best performance.
For more details: PyTorch docs
SageAttention
Attention kernels with QK in INT8 and PV with an FP16 accumulator.
sage_attention: true
Requirements: Ampere, Ada, or Hopper GPUs
pip install sageattention==2.2.0 --no-build-isolation
Only LoRA/QLoRA is recommended at the moment. We found the loss drops to 0 with full fine-tuning. See GitHub Issue.
For more details: Sage Attention
We do not support SageAttention 3 at the moment. If you are interested in adding this or improving the SageAttention implementation, please open an Issue.
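Following the LoRA/QLoRA recommendation above, a hypothetical config fragment pairing SageAttention with QLoRA could look like this; adapter and load_in_4bit are standard Axolotl adapter options and the values shown are only illustrative.
# hypothetical config fragment
sage_attention: true
# prefer LoRA/QLoRA over full fine-tuning with SageAttention
adapter: qlora
load_in_4bit: true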
xFormers
Memory-efficient attention from the xFormers library.
xformers_attention: true
We recommend using this with Turing GPUs or older (such as on Colab).
For more details: xFormers
Shifted Sparse Attention
We plan to deprecate this! If you use this feature, we recommend switching to one of the methods above.
Requirements: LLaMA model architecture
flash_attention: true
s2_attention: true
No sample packing support!
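For completeness, a hypothetical config fragment using shifted sparse attention, with sample packing explicitly disabled since it is not supported; the sample_packing key is assumed from Axolotl's general options.
# hypothetical config fragment for a LLaMA-architecture model
flash_attention: true
s2_attention: true
# sample packing is not supported with shifted sparse attention
sample_packing: false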