diff --git a/.nojekyll b/.nojekyll
index 66c2f2e33..d0f8f3511 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-3f6621ed
\ No newline at end of file
+36ffb084
\ No newline at end of file
diff --git a/FAQS.html b/FAQS.html
index 616640792..c852cda19 100644
--- a/FAQS.html
+++ b/FAQS.html
@@ -181,6 +181,12 @@ ul.task-list li input[type="checkbox"] {
   <a href="./docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="./docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/TODO.html b/TODO.html
index 83b76b3f6..f800a8985 100644
--- a/TODO.html
+++ b/TODO.html
@@ -181,6 +181,12 @@ ul.task-list li input[type="checkbox"] {
   <a href="./docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="./docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/amd_hpc.html b/docs/amd_hpc.html
index 4a14f3a9d..02812a9cc 100644
--- a/docs/amd_hpc.html
+++ b/docs/amd_hpc.html
@@ -216,6 +216,12 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link active">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/batch_vs_grad.html b/docs/batch_vs_grad.html
index c479e7201..5d0436512 100644
--- a/docs/batch_vs_grad.html
+++ b/docs/batch_vs_grad.html
@@ -182,6 +182,12 @@ ul.task-list li input[type="checkbox"] {
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/config.html b/docs/config.html
index 2e66a2c80..0a5f2029c 100644
--- a/docs/config.html
+++ b/docs/config.html
@@ -216,6 +216,12 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/dataset-formats/conversation.html b/docs/dataset-formats/conversation.html
index e7754df51..9ed8389a4 100644
--- a/docs/dataset-formats/conversation.html
+++ b/docs/dataset-formats/conversation.html
@@ -216,6 +216,12 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
   <a href="../../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/dataset-formats/index.html b/docs/dataset-formats/index.html
index 42f3ffd5b..b8c00207d 100644
--- a/docs/dataset-formats/index.html
+++ b/docs/dataset-formats/index.html
@@ -242,6 +242,12 @@ window.Quarto = {
   <a href="../../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
@@ -369,7 +375,7 @@ Description
 </tr>
 </thead>
 <tbody class="list">
-<tr data-index="0" data-listing-file-modified-sort="1738127324747" data-listing-reading-time-sort="1" data-listing-word-count-sort="92" data-listing-title-sort="Pre-training" data-listing-filename-sort="pretraining.qmd">
+<tr data-index="0" data-listing-file-modified-sort="1738127430206" data-listing-reading-time-sort="1" data-listing-word-count-sort="92" data-listing-title-sort="Pre-training" data-listing-filename-sort="pretraining.qmd">
 <td>
 <a href="../../docs/dataset-formats/pretraining.html" class="title listing-title">Pre-training</a>
 </td>
@@ -377,7 +383,7 @@ Description
 <span class="listing-description">Data format for a pre-training completion task.</span>
 </td>
 </tr>
-<tr data-index="1" data-listing-file-modified-sort="1738127324747" data-listing-reading-time-sort="2" data-listing-word-count-sort="308" data-listing-title-sort="Instruction Tuning" data-listing-filename-sort="inst_tune.qmd">
+<tr data-index="1" data-listing-file-modified-sort="1738127430206" data-listing-reading-time-sort="2" data-listing-word-count-sort="308" data-listing-title-sort="Instruction Tuning" data-listing-filename-sort="inst_tune.qmd">
 <td>
 <a href="../../docs/dataset-formats/inst_tune.html" class="title listing-title">Instruction Tuning</a>
 </td>
@@ -385,7 +391,7 @@ Description
 <span class="listing-description">Instruction tuning formats for supervised fine-tuning.</span>
 </td>
 </tr>
-<tr data-index="2" data-listing-file-modified-sort="1738127324747" data-listing-reading-time-sort="4" data-listing-word-count-sort="625" data-listing-title-sort="Conversation" data-listing-filename-sort="conversation.qmd">
+<tr data-index="2" data-listing-file-modified-sort="1738127430206" data-listing-reading-time-sort="4" data-listing-word-count-sort="625" data-listing-title-sort="Conversation" data-listing-filename-sort="conversation.qmd">
 <td>
 <a href="../../docs/dataset-formats/conversation.html" class="title listing-title">Conversation</a>
 </td>
@@ -393,7 +399,7 @@ Description
 <span class="listing-description">Conversation format for supervised fine-tuning.</span>
 </td>
 </tr>
-<tr data-index="3" data-listing-file-modified-sort="1738127324747" data-listing-reading-time-sort="1" data-listing-word-count-sort="87" data-listing-title-sort="Stepwise Supervised Format" data-listing-filename-sort="stepwise_supervised.qmd">
+<tr data-index="3" data-listing-file-modified-sort="1738127430206" data-listing-reading-time-sort="1" data-listing-word-count-sort="87" data-listing-title-sort="Stepwise Supervised Format" data-listing-filename-sort="stepwise_supervised.qmd">
 <td>
 <a href="../../docs/dataset-formats/stepwise_supervised.html" class="title listing-title">Stepwise Supervised Format</a>
 </td>
@@ -401,7 +407,7 @@ Description
 <span class="listing-description">Format for datasets with stepwise completions and labels</span>
 </td>
 </tr>
-<tr data-index="4" data-listing-file-modified-sort="1738127324747" data-listing-reading-time-sort="1" data-listing-word-count-sort="3" data-listing-title-sort="Template-Free" data-listing-filename-sort="template_free.qmd">
+<tr data-index="4" data-listing-file-modified-sort="1738127430206" data-listing-reading-time-sort="1" data-listing-word-count-sort="3" data-listing-title-sort="Template-Free" data-listing-filename-sort="template_free.qmd">
 <td>
 <a href="../../docs/dataset-formats/template_free.html" class="title listing-title">Template-Free</a>
 </td>
@@ -409,7 +415,7 @@ Description
 <span class="listing-description">Construct prompts without a template.</span>
 </td>
 </tr>
-<tr data-index="5" data-listing-file-modified-sort="1738127324747" data-listing-reading-time-sort="1" data-listing-word-count-sort="92" data-listing-title-sort="Custom Pre-Tokenized Dataset" data-listing-filename-sort="tokenized.qmd">
+<tr data-index="5" data-listing-file-modified-sort="1738127430206" data-listing-reading-time-sort="1" data-listing-word-count-sort="92" data-listing-title-sort="Custom Pre-Tokenized Dataset" data-listing-filename-sort="tokenized.qmd">
 <td>
 <a href="../../docs/dataset-formats/tokenized.html" class="title listing-title">Custom Pre-Tokenized Dataset</a>
 </td>
diff --git a/docs/dataset-formats/inst_tune.html b/docs/dataset-formats/inst_tune.html
index 7552abda8..6bb1ddaec 100644
--- a/docs/dataset-formats/inst_tune.html
+++ b/docs/dataset-formats/inst_tune.html
@@ -216,6 +216,12 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
   <a href="../../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/dataset-formats/pretraining.html b/docs/dataset-formats/pretraining.html
index 85a03915a..42fd5c786 100644
--- a/docs/dataset-formats/pretraining.html
+++ b/docs/dataset-formats/pretraining.html
@@ -216,6 +216,12 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
   <a href="../../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/dataset-formats/stepwise_supervised.html b/docs/dataset-formats/stepwise_supervised.html
index a3fa20c28..13798ca35 100644
--- a/docs/dataset-formats/stepwise_supervised.html
+++ b/docs/dataset-formats/stepwise_supervised.html
@@ -182,6 +182,12 @@ ul.task-list li input[type="checkbox"] {
   <a href="../../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/dataset-formats/template_free.html b/docs/dataset-formats/template_free.html
index 28a3f0bd8..398c3fd5e 100644
--- a/docs/dataset-formats/template_free.html
+++ b/docs/dataset-formats/template_free.html
@@ -182,6 +182,12 @@ ul.task-list li input[type="checkbox"] {
   <a href="../../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/dataset-formats/tokenized.html b/docs/dataset-formats/tokenized.html
index aee0578ea..e73f94f4f 100644
--- a/docs/dataset-formats/tokenized.html
+++ b/docs/dataset-formats/tokenized.html
@@ -216,6 +216,12 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
   <a href="../../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/dataset_preprocessing.html b/docs/dataset_preprocessing.html
index 9488f4616..30da72e02 100644
--- a/docs/dataset_preprocessing.html
+++ b/docs/dataset_preprocessing.html
@@ -182,6 +182,12 @@ ul.task-list li input[type="checkbox"] {
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/debugging.html b/docs/debugging.html
index 27e08979a..9e7b765e2 100644
--- a/docs/debugging.html
+++ b/docs/debugging.html
@@ -216,6 +216,12 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/faq.html b/docs/faq.html
index fee0770c3..77e1ccfc2 100644
--- a/docs/faq.html
+++ b/docs/faq.html
@@ -182,6 +182,12 @@ ul.task-list li input[type="checkbox"] {
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/fsdp_qlora.html b/docs/fsdp_qlora.html
index 5efe81c74..682ad75bd 100644
--- a/docs/fsdp_qlora.html
+++ b/docs/fsdp_qlora.html
@@ -182,6 +182,12 @@ ul.task-list li input[type="checkbox"] {
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/images/ray-cluster-dashboard.png b/docs/images/ray-cluster-dashboard.png
new file mode 100644
index 000000000..f0b4beb56
Binary files /dev/null and b/docs/images/ray-cluster-dashboard.png differ
diff --git a/docs/input_output.html b/docs/input_output.html
index 492542ac5..183029527 100644
--- a/docs/input_output.html
+++ b/docs/input_output.html
@@ -216,6 +216,12 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/lr_groups.html b/docs/lr_groups.html
index 06c3e104e..d4443b360 100644
--- a/docs/lr_groups.html
+++ b/docs/lr_groups.html
@@ -216,6 +216,12 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/mac.html b/docs/mac.html
index b3ffd3f11..133ac4865 100644
--- a/docs/mac.html
+++ b/docs/mac.html
@@ -182,6 +182,12 @@ ul.task-list li input[type="checkbox"] {
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/multi-node.html b/docs/multi-node.html
index a373ea3f8..ca798e0f9 100644
--- a/docs/multi-node.html
+++ b/docs/multi-node.html
@@ -216,6 +216,12 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/multimodal.html b/docs/multimodal.html
index 78338600b..168b1d1ee 100644
--- a/docs/multimodal.html
+++ b/docs/multimodal.html
@@ -215,6 +215,12 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/multipack.html b/docs/multipack.html
index 0464065fb..13321d835 100644
--- a/docs/multipack.html
+++ b/docs/multipack.html
@@ -182,6 +182,12 @@ ul.task-list li input[type="checkbox"] {
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/nccl.html b/docs/nccl.html
index 2a31d4dff..cbbffdaaf 100644
--- a/docs/nccl.html
+++ b/docs/nccl.html
@@ -182,6 +182,12 @@ ul.task-list li input[type="checkbox"] {
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/ray-integration.html b/docs/ray-integration.html
new file mode 100644
index 000000000..0fc3560f0
--- /dev/null
+++ b/docs/ray-integration.html
@@ -0,0 +1,836 @@
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
+
+<meta charset="utf-8">
+<meta name="generator" content="quarto-1.6.40">
+
+<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
+
+<meta name="description" content="How to use Axolotl with Ray Train">
+
+<title>Ray Train integration – Axolotl</title>
+<style>
+code{white-space: pre-wrap;}
+span.smallcaps{font-variant: small-caps;}
+div.columns{display: flex; gap: min(4vw, 1.5em);}
+div.column{flex: auto; overflow-x: auto;}
+div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
+ul.task-list{list-style: none;}
+ul.task-list li input[type="checkbox"] {
+  width: 0.8em;
+  margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */ 
+  vertical-align: middle;
+}
+/* CSS for syntax highlighting */
+pre > code.sourceCode { white-space: pre; position: relative; }
+pre > code.sourceCode > span { line-height: 1.25; }
+pre > code.sourceCode > span:empty { height: 1.2em; }
+.sourceCode { overflow: visible; }
+code.sourceCode > span { color: inherit; text-decoration: inherit; }
+div.sourceCode { margin: 1em 0; }
+pre.sourceCode { margin: 0; }
+@media screen {
+div.sourceCode { overflow: auto; }
+}
+@media print {
+pre > code.sourceCode { white-space: pre-wrap; }
+pre > code.sourceCode > span { display: inline-block; text-indent: -5em; padding-left: 5em; }
+}
+pre.numberSource code
+  { counter-reset: source-line 0; }
+pre.numberSource code > span
+  { position: relative; left: -4em; counter-increment: source-line; }
+pre.numberSource code > span > a:first-child::before
+  { content: counter(source-line);
+    position: relative; left: -1em; text-align: right; vertical-align: baseline;
+    border: none; display: inline-block;
+    -webkit-touch-callout: none; -webkit-user-select: none;
+    -khtml-user-select: none; -moz-user-select: none;
+    -ms-user-select: none; user-select: none;
+    padding: 0 4px; width: 4em;
+  }
+pre.numberSource { margin-left: 3em;  padding-left: 4px; }
+div.sourceCode
+  {   }
+@media screen {
+pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
+}
+</style>
+
+
+<script src="../site_libs/quarto-nav/quarto-nav.js"></script>
+<script src="../site_libs/clipboard/clipboard.min.js"></script>
+<script src="../site_libs/quarto-search/autocomplete.umd.js"></script>
+<script src="../site_libs/quarto-search/fuse.min.js"></script>
+<script src="../site_libs/quarto-search/quarto-search.js"></script>
+<meta name="quarto:offset" content="../">
+<link href="../favicon.jpg" rel="icon" type="image/jpeg">
+<script src="../site_libs/quarto-html/quarto.js"></script>
+<script src="../site_libs/quarto-html/popper.min.js"></script>
+<script src="../site_libs/quarto-html/tippy.umd.min.js"></script>
+<script src="../site_libs/quarto-html/anchor.min.js"></script>
+<link href="../site_libs/quarto-html/tippy.css" rel="stylesheet">
+<link href="../site_libs/quarto-html/quarto-syntax-highlighting-549806ee2085284f45b00abea8c6df48.css" rel="stylesheet" id="quarto-text-highlighting-styles">
+<script src="../site_libs/bootstrap/bootstrap.min.js"></script>
+<link href="../site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
+<link href="../site_libs/bootstrap/bootstrap-1d8d3285ed62e8239ae07b1b029f75b0.min.css" rel="stylesheet" append-hash="true" id="quarto-bootstrap" data-mode="light">
+<script id="quarto-search-options" type="application/json">{
+  "location": "navbar",
+  "copy-button": false,
+  "collapse-after": 3,
+  "panel-placement": "end",
+  "type": "overlay",
+  "limit": 50,
+  "keyboard-shortcut": [
+    "f",
+    "/",
+    "s"
+  ],
+  "show-item-context": false,
+  "language": {
+    "search-no-results-text": "No results",
+    "search-matching-documents-text": "matching documents",
+    "search-copy-link-title": "Copy link to search",
+    "search-hide-matches-text": "Hide additional matches",
+    "search-more-match-text": "more match in this document",
+    "search-more-matches-text": "more matches in this document",
+    "search-clear-button-title": "Clear",
+    "search-text-placeholder": "",
+    "search-detached-cancel-button-title": "Cancel",
+    "search-submit-button-title": "Submit",
+    "search-label": "Search"
+  }
+}</script>
+
+
+<link rel="stylesheet" href="../styles.css">
+</head>
+
+<body class="nav-sidebar docked nav-fixed">
+
+<div id="quarto-search-results"></div>
+  <header id="quarto-header" class="headroom fixed-top">
+    <nav class="navbar navbar-expand " data-bs-theme="dark">
+      <div class="navbar-container container-fluid">
+      <div class="navbar-brand-container mx-auto">
+    <a class="navbar-brand" href="../index.html">
+    <span class="navbar-title">Axolotl</span>
+    </a>
+  </div>
+        <div class="quarto-navbar-tools tools-wide tools-end">
+    <a href="https://twitter.com/axolotl_ai" title="" class="quarto-navigation-tool px-1" aria-label="Twitter"><i class="bi bi-twitter"></i></a>
+    <a href="https://github.com/axolotl-ai-cloud/axolotl/" title="" class="quarto-navigation-tool px-1" aria-label="GitHub"><i class="bi bi-github"></i></a>
+    <a href="https://discord.gg/7m9sfhzaf3" title="" class="quarto-navigation-tool px-1" aria-label="Discord"><i class="bi bi-discord"></i></a>
+</div>
+          <div id="quarto-search" class="" title="Search"></div>
+      </div> <!-- /container-fluid -->
+    </nav>
+  <nav class="quarto-secondary-nav">
+    <div class="container-fluid d-flex">
+      <button type="button" class="quarto-btn-toggle btn" data-bs-toggle="collapse" role="button" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
+        <i class="bi bi-layout-text-sidebar-reverse"></i>
+      </button>
+        <nav class="quarto-page-breadcrumbs" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="../docs/debugging.html">How-To Guides</a></li><li class="breadcrumb-item"><a href="../docs/ray-integration.html">Ray Train integration</a></li></ol></nav>
+        <a class="flex-grow-1" role="navigation" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">      
+        </a>
+    </div>
+  </nav>
+</header>
+<!-- content -->
+<div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article page-navbar">
+<!-- sidebar -->
+  <nav id="quarto-sidebar" class="sidebar collapse collapse-horizontal quarto-sidebar-collapse-item sidebar-navigation docked overflow-auto">
+    <div class="sidebar-menu-container"> 
+    <ul class="list-unstyled mt-1">
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../index.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Home</span></a>
+  </div>
+</li>
+        <li class="sidebar-item sidebar-item-section">
+      <div class="sidebar-item-container"> 
+            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true">
+ <span class="menu-text">How-To Guides</span></a>
+          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true" aria-label="Toggle section">
+            <i class="bi bi-chevron-right ms-2"></i>
+          </a> 
+      </div>
+      <ul id="quarto-sidebar-section-1" class="collapse list-unstyled sidebar-section depth1 show">  
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/debugging.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Debugging</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/multipack.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Multipack (Sample Packing)</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">FSDP + QLoRA</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/input_output.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Template-free prompt construction</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/rlhf.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">RLHF (Beta)</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/nccl.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">NCCL</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/mac.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Mac M-series</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/multi-node.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Multi Node</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Unsloth</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link active">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
+</li>
+      </ul>
+  </li>
+        <li class="sidebar-item sidebar-item-section">
+      <div class="sidebar-item-container"> 
+            <a href="../docs/dataset-formats/index.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Dataset Formats</span></a>
+          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" role="navigation" aria-expanded="true" aria-label="Toggle section">
+            <i class="bi bi-chevron-right ms-2"></i>
+          </a> 
+      </div>
+      <ul id="quarto-sidebar-section-2" class="collapse list-unstyled sidebar-section depth1 show">  
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/dataset-formats/pretraining.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Pre-training</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/dataset-formats/inst_tune.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Instruction Tuning</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/dataset-formats/conversation.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Conversation</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/dataset-formats/stepwise_supervised.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Stepwise Supervised Format</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/dataset-formats/template_free.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Template-Free</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/dataset-formats/tokenized.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Custom Pre-Tokenized Dataset</span></a>
+  </div>
+</li>
+      </ul>
+  </li>
+        <li class="sidebar-item sidebar-item-section">
+      <div class="sidebar-item-container"> 
+            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="true">
+ <span class="menu-text">Reference</span></a>
+          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="true" aria-label="Toggle section">
+            <i class="bi bi-chevron-right ms-2"></i>
+          </a> 
+      </div>
+      <ul id="quarto-sidebar-section-3" class="collapse list-unstyled sidebar-section depth1 show">  
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/config.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Config options</span></a>
+  </div>
+</li>
+      </ul>
+  </li>
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/faq.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">FAQ</span></a>
+  </div>
+</li>
+    </ul>
+    </div>
+</nav>
+<div id="quarto-sidebar-glass" class="quarto-sidebar-collapse-item" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item"></div>
+<!-- margin-sidebar -->
+    <div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
+        <nav id="TOC" role="doc-toc" class="toc-active">
+    <h2 id="toc-title">On this page</h2>
+   
+  <ul>
+  <li><a href="#ray-cluster-setup" id="toc-ray-cluster-setup" class="nav-link active" data-scroll-target="#ray-cluster-setup">Ray cluster setup</a></li>
+  <li><a href="#sanity-check" id="toc-sanity-check" class="nav-link" data-scroll-target="#sanity-check">Sanity check</a></li>
+  <li><a href="#configuring-training-with-ray-train" id="toc-configuring-training-with-ray-train" class="nav-link" data-scroll-target="#configuring-training-with-ray-train">Configuring training with Ray Train</a></li>
+  <li><a href="#launching-training" id="toc-launching-training" class="nav-link" data-scroll-target="#launching-training">Launching training</a></li>
+  </ul>
+</nav>
+    </div>
+<!-- main -->
+<main class="content" id="quarto-document-content">
+
+<header id="title-block-header" class="quarto-title-block default"><nav class="quarto-page-breadcrumbs quarto-title-breadcrumbs d-none d-lg-block" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="../docs/debugging.html">How-To Guides</a></li><li class="breadcrumb-item"><a href="../docs/ray-integration.html">Ray Train integration</a></li></ol></nav>
+<div class="quarto-title">
+<h1 class="title">Ray Train integration</h1>
+</div>
+
+<div>
+  <div class="description">
+    How to use Axolotl with Ray Train
+  </div>
+</div>
+
+
+<div class="quarto-title-meta">
+
+    
+  
+    
+  </div>
+  
+
+
+</header>
+
+
+<p>Axolotl supports using Ray as an alternative to <code>accelerate</code> for orchestrating training. This is especially useful for multi-node training, since you only have to set up code and dependencies on a single node and can launch training as if you were using a single node.</p>
+<p>With the <code>--use-ray</code> CLI flag, Axolotl will use Ray Train’s <a href="https://docs.ray.io/en/latest/train/api/doc/ray.train.torch.TorchTrainer.html#ray.train.torch.TorchTrainer"><code>TorchTrainer</code></a> to run training.</p>
+<section id="ray-cluster-setup" class="level2">
+<h2 class="anchored" data-anchor-id="ray-cluster-setup">Ray cluster setup</h2>
+<p>A prerequisite for using the Ray Train integration is to set up a Ray cluster on your desired node(s). For a detailed guide on how to get started with Ray clusters, see the official Ray docs: <a href="https://docs.ray.io/en/latest/cluster/getting-started.html">https://docs.ray.io/en/latest/cluster/getting-started.html</a></p>
+<p>Every Ray cluster has one <em>head</em> node and a set of worker nodes. The head node is just like any other worker node, but it also runs certain special processes related to scheduling and orchestration. Ray-enabled scripts are run on the head node and, depending on the resources (number of CPUs, GPUs, etc.) they request, will be scheduled to run certain tasks on the worker nodes. For more on the key concepts behind a Ray cluster, you can refer to this <a href="https://docs.ray.io/en/latest/cluster/key-concepts.html#cluster-key-concepts">doc</a>.</p>
+</section>
+<section id="sanity-check" class="level2">
+<h2 class="anchored" data-anchor-id="sanity-check">Sanity check</h2>
+<p>To run a sanity check on whether your Ray cluster is set up properly, execute the following on the head node:</p>
+<div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="ex">ray</span> status</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+<p>The output should have a summary of your Ray cluster - list of all the nodes in your cluster, the number of CPUs and GPUs in your cluster, etc. For example, if you have a cluster with 1 CPU-only head node and 2 4xL40S worker nodes, the output can look like this:</p>
+<pre><code>Node status
+---------------------------------------------------------------
+Active:
+ 1 head
+Idle:
+ 2 4xL40S:48CPU-384GB
+Pending:
+ (no pending nodes)
+Recent failures:
+ (no failures)
+
+Resources
+---------------------------------------------------------------
+Usage:
+ 0.0/96.0 CPU
+ 0.0/8.0 GPU
+ 0B/800.00GiB memory
+ 0B/229.57GiB object_store_memory
+
+Demands:
+ (no resource demands)</code></pre>
+<p>You should also be able to see the same on the <a href="https://docs.ray.io/en/latest/ray-observability/getting-started.html">Ray dashboard</a>.</p>
+</section>
+<section id="configuring-training-with-ray-train" class="level2">
+<h2 class="anchored" data-anchor-id="configuring-training-with-ray-train">Configuring training with Ray Train</h2>
+<p>You can find an example configuration at <code>examples/llama-3/lora-1b-ray.yml</code>.</p>
+<p>The key parameters to note here are:</p>
+<div class="sourceCode" id="cb3"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="co">...</span></span>
+<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="co">use_ray: true</span></span>
+<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a><span class="co">ray_num_workers: 4</span></span>
+<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a><span class="co"># optional</span></span>
+<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a><span class="co">resources_per_worker:</span></span>
+<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a><span class="co">    GPU: 1</span></span>
+<span id="cb3-7"><a href="#cb3-7" aria-hidden="true" tabindex="-1"></a><span class="co">...</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+<ul>
+<li><code>use_ray</code>: This is the flag that enables the Ray Train integration. You can either use the corresponding <code>--use-ray</code> flag in the CLI or set <code>use_ray</code> in the config file.</li>
+<li><code>ray_num_workers</code>: This is the number of workers/GPUs to use for training.</li>
+<li><code>resources_per_worker</code>: This is the Ray <a href="https://docs.ray.io/en/latest/ray-core/scheduling/resources.html">resource request</a> for each worker. This can be used to request a specific GPU type or a custom resource for each worker. For example, if your Ray cluster has GPUs of different types, and you only want to use NVIDIA L40S GPUs, you can do:</li>
+</ul>
+<div class="sourceCode" id="cb4"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="fu">resources_per_worker</span><span class="kw">:</span></span>
+<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="at">    accelerator_type</span><span class="fu">:L40S</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.001</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+</section>
+<section id="launching-training" class="level2">
+<h2 class="anchored" data-anchor-id="launching-training">Launching training</h2>
+<p>You can simply run the following command on the head node:</p>
+<div class="sourceCode" id="cb5"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> train examples/llama-3/lora-1b-ray.yml <span class="at">--use-ray</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+<p>This will launch training on the head node and workers will be scheduled automatically by Ray Train to run on the appropriate head or worker nodes.</p>
+<p>You can also monitor training progress on the Ray dashboard.</p>
+<p>Coming back to the example on a Ray cluster with 1 head node and 2 4xL40S worker nodes, let’s say you want to make use of all 8 GPUs. You would be able to just set <code>ray_num_workers: 8</code> and run the previous command. The Cluster tab will show the following:</p>
+<div class="quarto-figure quarto-figure-center">
+<figure class="figure">
+<p><img src="./images/ray-cluster-dashboard.png" class="img-fluid figure-img" alt="Cluster tab of the Ray dashboard"></p>
+<figcaption>Ray dashboard</figcaption>
+</figure>
+</div>
+
+
+</section>
+
+</main> <!-- /main -->
+<script id="quarto-html-after-body" type="application/javascript">
+window.document.addEventListener("DOMContentLoaded", function (event) {
+  const toggleBodyColorMode = (bsSheetEl) => { // Mirrors the element's data-mode attribute onto <body> as the quarto-dark / quarto-light class.
+    const mode = bsSheetEl.getAttribute("data-mode");
+    const bodyEl = window.document.querySelector("body");
+    if (mode === "dark") {
+      bodyEl.classList.add("quarto-dark");
+      bodyEl.classList.remove("quarto-light");
+    } else { // any value other than "dark" (including a missing attribute) is treated as light
+      bodyEl.classList.add("quarto-light");
+      bodyEl.classList.remove("quarto-dark");
+    }
+  }
+  const toggleBodyColorPrimary = () => { // Applies the body color class from link#quarto-bootstrap; does nothing when that element is absent.
+    const bsSheetEl = window.document.querySelector("link#quarto-bootstrap");
+    if (bsSheetEl) {
+      toggleBodyColorMode(bsSheetEl);
+    }
+  }
+  toggleBodyColorPrimary();  // run once from the DOMContentLoaded callback
+  const icon = "";
+  const anchorJS = new window.AnchorJS();
+  anchorJS.options = {
+    placement: 'right',
+    icon: icon
+  };
+  anchorJS.add('.anchored');
+  const isCodeAnnotation = (el) => { // Returns true when `el` has at least one class starting with "code-annotation-".
+    for (const clz of el.classList) {
+      if (clz.startsWith('code-annotation-')) {                     
+        return true;
+      }
+    }
+    return false;
+  }
+  const onCopySuccess = function(e) { // Copy-success feedback: flashes a "Copied!" state on the button in e.trigger, then restores it after 1s.
+    // button target
+    const button = e.trigger;
+    // don't keep focus
+    button.blur();
+    // flash "checked"
+    button.classList.add('code-copy-button-checked');
+    var currentTitle = button.getAttribute("title"); // remembered so the title can be restored in the timeout below
+    button.setAttribute("title", "Copied!");
+    let tooltip;
+    if (window.bootstrap) { // the tooltip is only created when window.bootstrap is available
+      button.setAttribute("data-bs-toggle", "tooltip");
+      button.setAttribute("data-bs-placement", "left");
+      button.setAttribute("data-bs-title", "Copied!");
+      tooltip = new bootstrap.Tooltip(button, 
+        { trigger: "manual", 
+          customClass: "code-copy-button-tooltip",
+          offset: [0, -8]});
+      tooltip.show();    
+    }
+    setTimeout(function() { // undo everything set above once the 1000 ms flash is over
+      if (tooltip) {
+        tooltip.hide();
+        button.removeAttribute("data-bs-title");
+        button.removeAttribute("data-bs-toggle");
+        button.removeAttribute("data-bs-placement");
+      }
+      button.setAttribute("title", currentTitle);
+      button.classList.remove('code-copy-button-checked');
+    }, 1000);
+    // clear code selection
+    e.clearSelection();
+  }
+  const getTextToCopy = function(trigger) { // Returns the innerText of the element preceding `trigger`, minus its code-annotation children.
+      const codeEl = trigger.previousElementSibling.cloneNode(true); // deep clone, so the removals below never touch the page itself
+      for (const childEl of codeEl.children) { // NOTE(review): children is a live collection; removing during iteration may skip the next sibling - confirm
+        if (isCodeAnnotation(childEl)) {
+          childEl.remove();
+        }
+      }
+      return codeEl.innerText;
+  }
+  const clipboard = new window.ClipboardJS('.code-copy-button:not([data-in-quarto-modal])', { // copy buttons that are not flagged data-in-quarto-modal
+    text: getTextToCopy
+  });
+  clipboard.on('success', onCopySuccess);
+  if (window.document.getElementById('quarto-embedded-source-code-modal')) { // second instance only when the embedded-source modal exists
+    const clipboardModal = new window.ClipboardJS('.code-copy-button[data-in-quarto-modal]', {
+      text: getTextToCopy,
+      container: window.document.getElementById('quarto-embedded-source-code-modal')
+    });
+    clipboardModal.on('success', onCopySuccess);
+  }
+    var localhostRegex = new RegExp(/^(?:http|https):\/\/localhost\:?[0-9]*\//); // http(s)://localhost with an optional port
+    var mailtoRegex = new RegExp(/^mailto:/);
+      var filterRegex = new RegExp("https:\/\/axolotl-ai-cloud\.github\.io\/axolotl\/"); // NOTE(review): string-literal escapes are dropped, so each "." matches any character; the pattern is also unanchored
+    var isInternal = (href) => { // true for hrefs matching the site URL pattern, localhost, or mailto:
+        return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
+    }
+    // Inspect non-navigation links and adorn them if external
+ 	var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
+    for (var i=0; i<links.length; i++) {
+      const link = links[i];
+      if (!isInternal(link.href)) {
+        // undo the damage that might have been done by quarto-nav.js in the case of
+        // links that we want to consider external
+        if (link.dataset.originalHref !== undefined) { // restore only when a data-original-href attribute is present
+          link.href = link.dataset.originalHref;
+        }
+      }
+    }
+  function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) { // Attaches a tippy popup to `el`; each callback is added to the config only when truthy.
+    const config = {
+      allowHTML: true,
+      maxWidth: 500,
+      delay: 100,
+      arrow: false,
+      appendTo: function(el) { // mount the popup inside the parent of the element passed in
+          return el.parentElement;
+      },
+      interactive: true,
+      interactiveBorder: 10,
+      theme: 'quarto',
+      placement: 'bottom-start',
+    };
+    if (contentFn) {
+      config.content = contentFn;
+    }
+    if (onTriggerFn) {
+      config.onTrigger = onTriggerFn;
+    }
+    if (onUntriggerFn) {
+      config.onUntrigger = onUntriggerFn;
+    }
+    window.tippy(el, config); 
+  }
+  const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]'); // footnote reference links
+  for (var i=0; i<noterefs.length; i++) {
+    const ref = noterefs[i];
+    tippyHover(ref, function() { // content callback: the innerHTML of the element the reference points at
+      // use id or data attribute instead here
+      let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
+      try { href = new URL(href).hash; } catch {} // if href parses as an absolute URL keep only its hash; otherwise leave it unchanged
+      const id = href.replace(/^#\/?/, ""); // strip the leading "#" (and an optional "/") to get the element id
+      const note = window.document.getElementById(id);
+      if (note) {
+        return note.innerHTML;
+      } else {
+        return "";
+      }
+    });
+  }
+  const xrefs = window.document.querySelectorAll('a.quarto-xref');
+  const processXRef = (id, note) => { // Returns the tooltip HTML for cross-reference target `note`; `note` is modified in place along the way.
+    // Strip column container classes
+    const stripColumnClz = (el) => { // recursive: applied to `el` and every descendant element
+      el.classList.remove("page-full", "page-columns");
+      if (el.children) {
+        for (const child of el.children) {
+          stripColumnClz(child);
+        }
+      }
+    }
+    stripColumnClz(note)
+    if (id === null || id.startsWith('sec-')) {
+      // Special case sections, only their first couple elements
+      const container = document.createElement("div");
+      if (note.children && note.children.length > 2) { // preview = first child plus the first later child that is not an empty <p>
+        container.appendChild(note.children[0].cloneNode(true));
+        for (let i = 1; i < note.children.length; i++) {
+          const child = note.children[i];
+          if (child.tagName === "P" && child.innerText === "") {
+            continue;
+          } else {
+            container.appendChild(child.cloneNode(true));
+            break;
+          }
+        }
+        if (window.Quarto?.typesetMath) { // math typesetting is applied only when the hook is defined
+          window.Quarto.typesetMath(container);
+        }
+        return container.innerHTML
+      } else { // two or fewer children: return the whole note
+        if (window.Quarto?.typesetMath) {
+          window.Quarto.typesetMath(note);
+        }
+        return note.innerHTML;
+      }
+    } else {
+      // Remove any anchor links if they are present
+      const anchorLink = note.querySelector('a.anchorjs-link'); // only the first matching link is removed
+      if (anchorLink) {
+        anchorLink.remove();
+      }
+      if (window.Quarto?.typesetMath) {
+        window.Quarto.typesetMath(note);
+      }
+      if (note.classList.contains("callout")) { // callouts return outerHTML so the wrapper element itself is included
+        return note.outerHTML;
+      } else {
+        return note.innerHTML;
+      }
+    }
+  }
+  for (var i=0; i<xrefs.length; i++) { // each a.quarto-xref gets a popup whose content is resolved in the onTrigger callback
+    const xref = xrefs[i];
+    tippyHover(xref, undefined, function(instance) {
+      instance.disable(); // re-enabled and shown from the finally blocks below
+      let url = xref.getAttribute('href');
+      let hash = undefined; 
+      if (url.startsWith('#')) {
+        hash = url;
+      } else {
+        try { hash = new URL(url).hash; } catch {} // hash stays undefined when url does not parse as an absolute URL
+      }
+      if (hash) {
+        const id = hash.replace(/^#\/?/, "");
+        const note = window.document.getElementById(id);
+        if (note !== null) { // target is on this page: process a clone so the page itself is not modified
+          try {
+            const html = processXRef(id, note.cloneNode(true));
+            instance.setContent(html);
+          } finally {
+            instance.enable();
+            instance.show();
+          }
+        } else {
+          // See if we can fetch this
+          fetch(url.split('#')[0])
+          .then(res => res.text())
+          .then(html => {
+            const parser = new DOMParser();
+            const htmlDoc = parser.parseFromString(html, "text/html");
+            const note = htmlDoc.getElementById(id);
+            if (note !== null) {
+              const html = processXRef(id, note);
+              instance.setContent(html);
+            } 
+          }).finally(() => {
+            instance.enable();
+            instance.show();
+          });
+        }
+      } else {
+        // See if we can fetch a full url (with no hash to target)
+        // This is a special case and we should probably do some content thinning / targeting
+        fetch(url)
+        .then(res => res.text())
+        .then(html => {
+          const parser = new DOMParser();
+          const htmlDoc = parser.parseFromString(html, "text/html");
+          const note = htmlDoc.querySelector('main.content');
+          if (note !== null) {
+            // This should only happen for chapter cross references
+            // (since there is no id in the URL)
+            // remove the first header
+            if (note.children.length > 0 && note.children[0].tagName === "HEADER") {
+              note.children[0].remove();
+            }
+            const html = processXRef(null, note); // a null id selects the section-preview branch of processXRef
+            instance.setContent(html);
+          } 
+        }).finally(() => {
+          instance.enable();
+          instance.show();
+        });
+      }
+    }, function(instance) { // onUntrigger callback: empty body
+    });
+  }
+      let selectedAnnoteEl;
+      const selectorForAnnotation = ( cell, annotation) => { // Builds: span[data-code-cell="<cell>"][data-code-annotation="<annotation>"]
+        let cellAttr = 'data-code-cell="' + cell + '"';
+        let lineAttr = 'data-code-annotation="' +  annotation + '"';
+        const selector = 'span[' + cellAttr + '][' + lineAttr + ']';
+        return selector;
+      }
+      const selectCodeLines = (annoteEl) => { // Positions the highlight and gutter divs over the code lines targeted by `annoteEl`, then records it as selected.
+        const doc = window.document; // NOTE(review): not referenced again in this function
+        const targetCell = annoteEl.getAttribute("data-target-cell");
+        const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
+        const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation)); // NOTE(review): no null check before the dereference on the next line
+        const lines = annoteSpan.getAttribute("data-code-lines").split(",");
+        const lineIds = lines.map((line) => { // element ids have the form "<targetCell>-<line>"
+          return targetCell + "-" + line;
+        })
+        let top = null;
+        let height = null;
+        let parent = null;
+        if (lineIds.length > 0) {
+            //compute the position of the single el (top and bottom and make a div)
+            const el = window.document.getElementById(lineIds[0]);
+            top = el.offsetTop;
+            height = el.offsetHeight;
+            parent = el.parentElement.parentElement;
+          if (lineIds.length > 1) { // several lines: height spans from the first line's top to the last line's bottom
+            const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
+            const bottom = lastEl.offsetTop + lastEl.offsetHeight;
+            height = bottom - top;
+          }
+          if (top !== null && height !== null && parent !== null) {
+            // cook up a div (if necessary) and position it 
+            let div = window.document.getElementById("code-annotation-line-highlight");
+            if (div === null) {
+              div = window.document.createElement("div");
+              div.setAttribute("id", "code-annotation-line-highlight");
+              div.style.position = 'absolute';
+              parent.appendChild(div);
+            }
+            div.style.top = top - 2 + "px"; // 2px above and 4px taller than the measured lines
+            div.style.height = height + 4 + "px";
+            div.style.left = 0;
+            let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
+            if (gutterDiv === null) {
+              gutterDiv = window.document.createElement("div");
+              gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
+              gutterDiv.style.position = 'absolute';
+              const codeCell = window.document.getElementById(targetCell);
+              const gutter = codeCell.querySelector('.code-annotation-gutter');
+              gutter.appendChild(gutterDiv);
+            }
+            gutterDiv.style.top = top - 2 + "px";
+            gutterDiv.style.height = height + 4 + "px";
+          }
+          selectedAnnoteEl = annoteEl; // read by the resize listener and the click handlers
+        }
+      };
+      const unselectCodeLines = () => { // Removes the highlight and gutter divs (when present) and clears the current selection.
+        const elementsIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"];
+        elementsIds.forEach((elId) => {
+          const div = window.document.getElementById(elId);
+          if (div) {
+            div.remove();
+          }
+        });
+        selectedAnnoteEl = undefined;
+      };
+        // Handle positioning of the toggle
+    window.addEventListener(
+      "resize",
+      throttle(() => { // throttle is a function declaration further down, so it is hoisted and usable here
+        elRect = undefined; // NOTE(review): elRect is not declared anywhere in the visible script - confirm where it is defined
+        if (selectedAnnoteEl) { // re-run positioning for the currently selected annotation, if any
+          selectCodeLines(selectedAnnoteEl);
+        }
+      }, 10)
+    );
+    function throttle(fn, ms) { // Wraps fn: the first call runs immediately; later calls are collapsed into one run `ms` after the most recent call.
+    let throttle = false; // local flag shadowing the function name: true once a call has gone through
+    let timer;
+      return (...args) => {
+        if(!throttle) { // first call gets through
+            fn.apply(this, args);
+            throttle = true;
+        } else { // all the others get throttled
+            if(timer) clearTimeout(timer); // cancel #2
+            timer = setTimeout(() => {
+              fn.apply(this, args);
+              timer = throttle = false; // reset so the next call runs immediately again
+            }, ms);
+        }
+      };
+    }
+      // Attach click handler to the DT
+      const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
+      for (const annoteDlNode of annoteDls) {
+        annoteDlNode.addEventListener('click', (event) => { // clicking toggles the highlight: select a new annotation, or unselect the current one
+          const clickedEl = event.target;
+          if (clickedEl !== selectedAnnoteEl) {
+            unselectCodeLines();
+            const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active'); // previously active DT, if any
+            if (activeEl) {
+              activeEl.classList.remove('code-annotation-active');
+            }
+            selectCodeLines(clickedEl);
+            clickedEl.classList.add('code-annotation-active');
+          } else {
+            // Unselect the line
+            unselectCodeLines();
+            clickedEl.classList.remove('code-annotation-active');
+          }
+        });
+      }
+  const findCites = (el) => { // Walks up from `el` until a parent has a data-cites attribute; returns { el, cites } or undefined.
+    const parentEl = el.parentElement;
+    if (parentEl) {
+      const cites = parentEl.dataset.cites;
+      if (cites) {
+        return {
+          el, // the child of the element that carries data-cites, not that element itself
+          cites: cites.split(' ')
+        };
+      } else {
+        return findCites(el.parentElement)
+      }
+    } else { // reached an element with no parent without finding data-cites
+      return undefined;
+    }
+  };
+  var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]'); // bibliography reference links
+  for (var i=0; i<bibliorefs.length; i++) {
+    const ref = bibliorefs[i];
+    const citeInfo = findCites(ref);
+    if (citeInfo) { // a popup is attached only when an ancestor with data-cites was found
+      tippyHover(citeInfo.el, function() {
+        var popup = window.document.createElement('div');
+        citeInfo.cites.forEach(function(cite) { // one csl-entry div per cited key
+          var citeDiv = window.document.createElement('div');
+          citeDiv.classList.add('hanging-indent');
+          citeDiv.classList.add('csl-entry');
+          var biblioDiv = window.document.getElementById('ref-' + cite); // the entry is looked up by the id "ref-<cite>"
+          if (biblioDiv) {
+            citeDiv.innerHTML = biblioDiv.innerHTML;
+          }
+          popup.appendChild(citeDiv); // appended even when no matching entry exists, leaving an empty div
+        });
+        return popup.innerHTML;
+      });
+    }
+  }
+});
+</script>
+</div> <!-- /content -->
+
+
+
+
+</body></html>
\ No newline at end of file
diff --git a/docs/reward_modelling.html b/docs/reward_modelling.html
index 106ee2d6b..2c61d755a 100644
--- a/docs/reward_modelling.html
+++ b/docs/reward_modelling.html
@@ -216,6 +216,12 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/rlhf.html b/docs/rlhf.html
index 990e6f39c..044c420e1 100644
--- a/docs/rlhf.html
+++ b/docs/rlhf.html
@@ -216,6 +216,12 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/torchao.html b/docs/torchao.html
index 1743ccd43..263951685 100644
--- a/docs/torchao.html
+++ b/docs/torchao.html
@@ -216,6 +216,12 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/docs/unsloth.html b/docs/unsloth.html
index 2c159e144..e0da9bf6b 100644
--- a/docs/unsloth.html
+++ b/docs/unsloth.html
@@ -216,6 +216,12 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
   <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/examples/colab-notebooks/colab-axolotl-example.html b/examples/colab-notebooks/colab-axolotl-example.html
index b8d0b260c..3db956fdd 100644
--- a/examples/colab-notebooks/colab-axolotl-example.html
+++ b/examples/colab-notebooks/colab-axolotl-example.html
@@ -244,6 +244,12 @@ window.Quarto = {
   <a href="../../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/index.html b/index.html
index b2dffcfa3..466e3a232 100644
--- a/index.html
+++ b/index.html
@@ -215,6 +215,12 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
   <a href="./docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="./docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/search.json b/search.json
index f68132eb2..078fa9f38 100644
--- a/search.json
+++ b/search.json
@@ -196,125 +196,37 @@
     ]
   },
   {
-    "objectID": "docs/faq.html",
-    "href": "docs/faq.html",
-    "title": "FAQ",
+    "objectID": "docs/dataset_preprocessing.html",
+    "href": "docs/dataset_preprocessing.html",
+    "title": "Dataset Preprocessing",
     "section": "",
-    "text": "Q: The trainer stopped and hasn’t progressed in several minutes.\n\nA: Usually an issue with the GPUs communicating with each other. See the NCCL doc\n\nQ: Exitcode -9\n\nA: This usually happens when you run out of system RAM.\n\nQ: Exitcode -7 while using deepspeed\n\nA: Try upgrading deepspeed w: pip install -U deepspeed\n\nQ: AttributeError: ‘DummyOptim’ object has no attribute ‘step’\n\nA: You may be using deepspeed with single gpu. Please don’t set deepspeed: in yaml or cli.",
-    "crumbs": [
-      "FAQ"
-    ]
+    "text": "Dataset pre-processing is the step where Axolotl takes each dataset you’ve configured alongside the (dataset format)[../dataset-formats/] and prompt strategies to: - parse the dataset based on the dataset format - transform the dataset to how you would interact with the model based on the prompt strategy - tokenize the dataset based on the configured model & tokenizer - shuffle and merge multiple datasets together if using more than one\nThe processing of the datasets can happen one of two ways:\n\nBefore kicking off training by calling python -m axolotl.cli.preprocess /path/to/your.yaml --debug\nWhen training is started\n\nWhat are the benefits of pre-processing? When training interactively or for sweeps (e.g. you are restarting the trainer often), processing the datasets can oftentimes be frustratingly slow. Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent training parameters so that it will intelligently pull from its cache when possible.\nThe path of the cache is controlled by dataset_prepared_path: and is often left blank in example YAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data.\nIf dataset_prepared_path: is left empty, when training, the processed dataset will be cached in a default path of ./last_run_prepared/, but will ignore anything already cached there. By explicitly setting dataset_prepared_path: ./last_run_prepared, the trainer will use whatever pre-processed data is in the cache.\nWhat are the edge cases? Let’s say you are writing a custom prompt strategy or using a user-defined prompt template. Because the trainer cannot readily detect these changes, we cannot change the calculated hash value for the pre-processed dataset. If you have dataset_prepared_path: ... set and change your prompt templating logic, it may not pick up the changes you made and you will be training over the old prompt."
   },
   {
-    "objectID": "docs/multi-node.html",
-    "href": "docs/multi-node.html",
-    "title": "Multi Node",
+    "objectID": "docs/batch_vs_grad.html",
+    "href": "docs/batch_vs_grad.html",
+    "title": "Batch size vs Gradient accumulation",
     "section": "",
-    "text": "You will need to create a configuration for accelerate, either by using accelerate config and follow the instructions or you can use one of the preset below:\n~/.cache/huggingface/accelerate/default_config.yaml\nConfigure your model to use FSDP with for example:",
+    "text": "Gradient accumulation means accumulating gradients over several mini-batches and updating the model weights afterward. When the samples in each batch are diverse, this technique doesn’t significantly impact learning.\nThis method allows for effective training with larger effective batch sizes without needing proportionally larger memory. Here’s why:\n\nMemory Consumption with Batch Size: The primary reason increasing the batch size impacts memory is due to the storage requirements for intermediate activations. When you forward propagate a batch through a network, you have to store the activations at each layer for each sample in the batch, because these activations are used during backpropagation to compute gradients. Therefore, larger batches mean more activations, leading to greater GPU memory consumption.\nGradient Accumulation: With gradient accumulation, you’re effectively simulating a larger batch size by accumulating gradients over several smaller batches (or micro-batches). However, at any given time, you’re only forward and backward propagating a micro-batch. This means you only store activations for the micro-batch, not the full accumulated batch. 
As a result, you can simulate the effect of a larger batch size without the memory cost of storing activations for a large batch.\n\nExample 1: Micro batch size: 3 Gradient accumulation steps: 2 Number of GPUs: 3 Total batch size = 3 * 2 * 3 = 18\n| GPU 1          | GPU 2          | GPU 3          |\n|----------------|----------------|----------------|\n| S1, S2, S3     | S4, S5, S6     | S7, S8, S9     |\n| e1, e2, e3     | e4, e5, e6     | e7, e8, e9     |\n|----------------|----------------|----------------|\n| → (accumulate) | → (accumulate) | → (accumulate) |\n|----------------|----------------|----------------|\n| S10, S11, S12  | S13, S14, S15  | S16, S17, S18  |\n| e10, e11, e12  | e13, e14, e15  | e16, e17, e18  |\n|----------------|----------------|----------------|\n| → (apply)      | → (apply)      | → (apply)      |\n\nAccumulated gradient for the weight w1 after the second iteration (considering all GPUs):\nTotal gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6 + e7 + e8 + e9 + e10 + e11 + e12 + e13 + e14 + e15 + e16 + e17 + e18\n\nWeight update for w1:\nw1_new = w1_old - learning rate x (Total gradient for w1 / 18)\nExample 2: Micro batch size: 2 Gradient accumulation steps: 1 Number of GPUs: 3 Total batch size = 2 * 1 * 3 = 6\n| GPU 1     | GPU 2     | GPU 3     |\n|-----------|-----------|-----------|\n| S1, S2    | S3, S4    | S5, S6    |\n| e1, e2    | e3, e4    | e5, e6    |\n|-----------|-----------|-----------|\n| → (apply) | → (apply) | → (apply) |\n\nAccumulated gradient for the weight w1 (considering all GPUs):\nTotal gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6\n\nWeight update for w1:\nw1_new = w1_old - learning rate × (Total gradient for w1 / 6)"
+  },
+  {
+    "objectID": "docs/multimodal.html",
+    "href": "docs/multimodal.html",
+    "title": "MultiModal / Vision Language Models (BETA)",
+    "section": "",
+    "text": "MultiModal / Vision Language Models (BETA)\n\nSupported Models\n\nMllama, i.e. llama with vision models\n\n\n\nUsage\nCurrently multimodal support is limited and doesn’t have full feature parity. To finetune a multimodal Llama w/ LoRA, you’ll need to use the following in YAML in combination with the rest of the required hyperparams.\nbase_model: alpindale/Llama-3.2-11B-Vision-Instruct\nprocessor_type: AutoProcessor\nskip_prepare_dataset: true\n\nchat_template: llama3_2_vision\ndatasets:\n  - path: HuggingFaceH4/llava-instruct-mix-vsft\n    type: chat_template\n    split: train[:1%]\n    field_messages: messages\nremove_unused_columns: false\nsample_packing: false\n\n# only finetune the Language model, leave the vision model and vision tower frozen\nlora_target_modules: 'language_model.model.layers.[\\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'"
+  },
+  {
+    "objectID": "docs/mac.html",
+    "href": "docs/mac.html",
+    "title": "Mac M-series",
+    "section": "",
+    "text": "Currently Axolotl on Mac is partially usable, many of the dependencies of Axolotl including Pytorch do not support MPS or have incomplete support.\nCurrent support:\n\nSupport for all models\nFull training of models\nLoRA training\nSample packing\nFP16 and BF16 (awaiting AMP support for MPS in Pytorch)\nTri-dao’s flash-attn (until it is supported use spd_attention as an alternative)\nxformers\nbitsandbytes (meaning no 4/8 bits loading and bnb optimizers)\nqlora\nDeepSpeed\n\nUntested: - FSDP",
     "crumbs": [
       "How-To Guides",
-      "Multi Node"
+      "Mac M-series"
     ]
   },
-  {
-    "objectID": "docs/multi-node.html#machine-configuration",
-    "href": "docs/multi-node.html#machine-configuration",
-    "title": "Multi Node",
-    "section": "Machine configuration",
-    "text": "Machine configuration\nOn each machine you need a copy of Axolotl, we suggest using the same commit to ensure compatibility.\nYou will also need to have the same configuration file for your model on each machine.\nOn the main machine only, make sure the port you set as main_process_port is open in TCP and reachable by other machines.\nAll you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.",
-    "crumbs": [
-      "How-To Guides",
-      "Multi Node"
-    ]
-  },
-  {
-    "objectID": "docs/unsloth.html",
-    "href": "docs/unsloth.html",
-    "title": "Unsloth",
-    "section": "",
-    "text": "Overview\nUnsloth provides hand-written optimized kernels for LLM finetuning that slightly improve speed and VRAM over standard industry baselines.\n\n\nInstallation\nThe following will install the correct unsloth and extras from source.\npython scripts/unsloth_install.py | sh\n\n\nUsing unsloth w Axolotl\nAxolotl exposes a few configuration options to try out unsloth and get most of the performance gains.\nOur unsloth integration is currently limited to the following model architectures: - llama\nThese options are specific to LoRA finetuning and cannot be used for multi-GPU finetuning\nunsloth_lora_mlp: true\nunsloth_lora_qkv: true\nunsloth_lora_o: true\nThese options are composable and can be used with multi-gpu finetuning\nunsloth_cross_entropy_loss: true\nunsloth_rms_norm: true\nunsloth_rope: true\n\n\nLimitations\n\nSingle GPU only; e.g. no multi-gpu support\nNo deepspeed or FSDP support (requires multi-gpu)\nLoRA + QLoRA support only. No full fine tunes or fp8 support.\nLimited model architecture support. Llama, Phi, Gemma, Mistral only\nNo MoE support.",
-    "crumbs": [
-      "How-To Guides",
-      "Unsloth"
-    ]
-  },
-  {
-    "objectID": "examples/colab-notebooks/colab-axolotl-example.html",
-    "href": "examples/colab-notebooks/colab-axolotl-example.html",
-    "title": "Setting up",
-    "section": "",
-    "text": "import torch\n# Check so there is a gpu available, a T4(free tier) is enough to run this notebook\nassert (torch.cuda.is_available()==True)\n!pip install --no-build-isolation axolotl[deepspeed]"
-  },
-  {
-    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#hugging-face-login-optional",
-    "href": "examples/colab-notebooks/colab-axolotl-example.html#hugging-face-login-optional",
-    "title": "Setting up",
-    "section": "Hugging Face login (optional)",
-    "text": "Hugging Face login (optional)\n\nfrom huggingface_hub import notebook_login\nnotebook_login()"
-  },
-  {
-    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#example-configuration",
-    "href": "examples/colab-notebooks/colab-axolotl-example.html#example-configuration",
-    "title": "Setting up",
-    "section": "Example configuration",
-    "text": "Example configuration\n\nimport yaml\n\nyaml_string = \"\"\"\nbase_model: NousResearch/Meta-Llama-3.1-8B\n\nload_in_8bit: false\nload_in_4bit: true\nstrict: false\n\ndatasets:\n  - path: tatsu-lab/alpaca\n    type: alpaca\ndataset_prepared_path: last_run_prepared\nval_set_size: 0.05\noutput_dir: ./outputs/lora-out\n\nsequence_len: 2048\nsample_packing: true\neval_sample_packing: true\npad_to_sequence_len: true\n\nadapter: qlora\nlora_model_dir:\nlora_r: 32\nlora_alpha: 16\nlora_dropout: 0.05\nlora_target_linear: true\nlora_fan_in_fan_out:\nlora_modules_to_save:\n  - embed_tokens\n  - lm_head\n\nwandb_project:\nwandb_entity:\nwandb_watch:\nwandb_name:\nwandb_log_model:\n\ngradient_accumulation_steps: 2\nmicro_batch_size: 1\nnum_epochs: 1\noptimizer: paged_adamw_8bit\nlr_scheduler: cosine\nlearning_rate: 2e-5\n\ntrain_on_inputs: false\ngroup_by_length: false\nbf16: auto\nfp16:\ntf32: false\n\ngradient_checkpointing: true\nearly_stopping_patience:\nresume_from_checkpoint:\nlogging_steps: 1\nxformers_attention:\nflash_attention: false\nsdp_attention: true\n\nwarmup_steps: 1\nmax_steps: 25\nevals_per_epoch: 1\neval_table_size:\nsaves_per_epoch: 1\ndebug:\ndeepspeed:\nweight_decay: 0.0\nfsdp:\nfsdp_config:\nspecial_tokens:\n  pad_token: &lt;|end_of_text|&gt;\n\"\"\"\n\n\n# Convert the YAML string to a Python dictionary\nyaml_dict = yaml.safe_load(yaml_string)\n\n# Specify your file path\nfile_path = 'test_axolotl.yaml'\n\n# Write the YAML file\nwith open(file_path, 'w') as file:\n    yaml.dump(yaml_dict, file)\n\nAbove we have a configuration file with base LLM model and datasets specified, among many other things. Axolotl can automatically detect whether the specified datasets are on HuggingFace repo or local machine.\nThe Axolotl configuration options encompass model and dataset selection, data pre-processing, and training. 
Let’s go through them line by line:\n\n“base model”: String value, specifies the underlying pre-trained LLM that will be used for finetuning\n\nNext we have options for model weights quantization. Quantization allows for reduction in occupied memory on GPUs.\n\n“load_in_8bit”: Boolean value, whether to quantize the model weights into 8-bit integer.\n“load_in_4bit”: Boolean value, whether to quantize the model weights into 4-bit integer.\n“strict”: Boolean value. If false, it allows for overriding established configuration options in the yaml file when executing in command-line interface.\n“datasets”: a list of dicts that contain path and type of data sets as well as other optional configurations where datasets are concerned. Supports multiple datasets.\n“val_set_size”: Either a float value less than one or an integer less than the total size of dataset. Sets the size of validation set from the whole dataset. If float, sets the proportion of the dataset assigned for validation. If integer, sets the direct size of validation set.\n“output_dir”: String value. Path of trained model.\n\nFor data preprocessing:\n\n“sequence_len”: Integer. Specifies the maximum sequence length of the input. Typically 2048 or less.\n“pad_to_sequence_len”: Boolean. Padding input to maximum sequence length.\n“sample_packing”: Boolean. Specifies whether to use multi-packing with block diagonal attention.\n“special_tokens”: Python dict, optional. Allows users to specify the additional special tokens to be ignored by the tokenizer.\n\nFor LoRA configuration and its hyperparamters:\n\n“adapter”: String. Either “lora” or “qlora”, depending on user’s choice.\n“lora_model_dir”: String, Optional. Path to directory that contains LoRA model, if there is already a trained LoRA model the user would like to use.\n“lora_r”: Integer. Refers to the rank of LoRA decomposition matrices. Higher value will reduce LoRA efficiency. Recommended to be set to 8.\n“lora_alpha”: Integer. 
Scale the weight matrices by \\(\\frac{\\text{lora_alpha}}{\\text{lora_r}}\\)Recommended to be fixed at 16.\n“lora_dropout”: Float that is 1 or less. The dropout probability of a lora layer.\n“lora_target_linear”: Boolean. If true, lora will target all linear modules in the transformers architecture.\n“lora_modules_to_save”: If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.\n\nSee LoRA for detailed explanation of LoRA implementation.\nFor the training configurations:\n\n“gradient_accumulation_steps”: Integer. The number of steps over which to accumulate gradient for batch training. E.g. if 2, backprop is performed every two steps.\n“micro_batch_size”: Integer. Batch size per gpu / gradient_accumulation_steps\n“num_epochs”: Integer. Number of epochs. One epoch is when training has looped over every batch in the whole data set once.\n“optimizer”: The optimizer to use for the training.\n“learning_rate”: The learning rate.\n“lr_scheduler”: The learning rate scheduler to use for adjusting learning rate during training.\n“train_on_inputs”: Boolean. Whether to ignore or include the user’s prompt from the training labels.\n“group_by_length”: Boolean. Whether to group similarly sized data to minimize padding.\n“bf16”: Either “auto”, “true”, or “false”. Whether to use CUDA bf16 floating point format. If set to “auto”, will automatically apply bf16 should the gpu supports it.\n“fp16”: Optional. Specifies whether to use CUDA fp16. Automatically set to true if “bf16” is set to true. Otherwise false.\n“tf32”: Boolean. Whether to use CUDA tf32. Will override bf16.\n“gradient_checkpointing”: Boolean. Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\n“gradient_checkpointing_kwargs”: Python Dict. Fed into the trainer.\n“logging_steps”: Integer. Log training information over every specified number of steps.\n“flash_attention”: Boolean. 
Whether to use the flash attention mechanism.\n“sdp_attention”: Boolean. Whether to use the Scaled Dot Product attention mechanism (the attention mechanism in the original implementation of transformers.)\n“warmup_steps”: Integer. The number of pre-training steps where a very low learning rate is used.\n“evals_per_epoch”: Integer. Number of evaluations to be performed within one training epoch.\n“saves_per_epoch”: Integer. Number of times the model is saved in one training epoch.\n“weight_decay”: Positive Float. Sets the “strength” of weight decay (i.e. setting the coefficient of L2 regularization)\n\nThe above is but a snippet aiming to get users familiarized with the types of streamlined configuration options axolotl provides. For a full list of configuration options, see here\nTrain the model\n\n!accelerate launch -m axolotl.cli.train /content/test_axolotl.yaml\n\nPredict with trained model\n\n!accelerate launch -m axolotl.cli.inference /content/test_axolotl.yaml \\\n    --lora_model_dir=\"./outputs/lora-out\" --gradio"
-  },
-  {
-    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#deeper-dive",
-    "href": "examples/colab-notebooks/colab-axolotl-example.html#deeper-dive",
-    "title": "Setting up",
-    "section": "Deeper Dive",
-    "text": "Deeper Dive\nIt is also helpful to gain some familiarity over some of the core inner workings of axolotl"
-  },
-  {
-    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#configuration-normalization",
-    "href": "examples/colab-notebooks/colab-axolotl-example.html#configuration-normalization",
-    "title": "Setting up",
-    "section": "Configuration Normalization",
-    "text": "Configuration Normalization\nAxolotl uses a custom Dict class, called DictDefault to store configurations specified in the yaml configuration file (into a Python variable named cfg). The definition for this custom Dict can be found in the utils/dict.py\nDictDefault is amended such that calling a missing key from it will result in a None return type. This is important because if some configuration options aren’t specified by the user, the None type allows Axolotl to perform boolean operations to determine the default settings for missing configurations. For more examples on how this is done, check out utils/config/init.py"
-  },
-  {
-    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#loading-models-tokenizers-and-trainer",
-    "href": "examples/colab-notebooks/colab-axolotl-example.html#loading-models-tokenizers-and-trainer",
-    "title": "Setting up",
-    "section": "Loading Models, Tokenizers, and Trainer",
-    "text": "Loading Models, Tokenizers, and Trainer\nIf we inspect cli.train.py, we will find that most of the heavy lifting were done by the function train() which is itself imported from src/axolotl/train.py.\ntrain() takes care of loading the appropriate tokenizer and pre-trained model through load_model() and load_tokenizer() from src/axolotl/utils/models.py respectively.\nload_tokenizer() loads in the appropriate tokenizer given the desired model, as well as chat templates.\nModelLoader class follows after tokenizer has been selected. It will automatically discern the base model type, load in the desired model, as well as applying model-appropriate attention mechanism modifications (e.g. flash attention). Depending on which base model the user chooses in the configuration, ModelLoader will utilize the corresponding “attention hijacking” script. For example, if the user specified the base model to be NousResearch/Meta-Llama-3.1-8B, which is of llama type, and set flash_attn to True, ModelLoader will load in llama_attn_hijack_flash.py. For a list of supported attention hijacking, please refer to the directory /src/axolotl/monkeypatch/\nAnother important operation encompassed in train() is setting up the training that takes into account of user-specified traning configurations (e.g. num_epochs, optimizer) through the use of setup_trainer() from /src/axolotl/utils/trainer.py, which in turn relies on modules from /src/axolotl/core/trainer_builder.py. trainer_builder.py provides a list of trainer object options bespoke for the task type (Causal or Reinforcement learning (‘dpo’, ‘ipo’, ‘kto’) )"
-  },
-  {
-    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#monkey-patch",
-    "href": "examples/colab-notebooks/colab-axolotl-example.html#monkey-patch",
-    "title": "Setting up",
-    "section": "Monkey patch",
-    "text": "Monkey patch\nThe Monkey patch directory is where model architecture/optimization patching scripts are stored (these are modifications that are not implemented in the official releases, hence the name monkey patch). It includes attention jacking, ReLoRA, and unsloth optimization."
-  },
-  {
-    "objectID": "TODO.html",
-    "href": "TODO.html",
-    "title": "todo list",
-    "section": "",
-    "text": "[] Validation of parameters for combinations that won’t work\n\n\n\n\nFSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203\nadamw_bnb_8bit doesn’t play well with FSDP offload"
-  },
-  {
-    "objectID": "TODO.html#things-that-are-known-not-to-work",
-    "href": "TODO.html#things-that-are-known-not-to-work",
-    "title": "todo list",
-    "section": "",
-    "text": "FSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203\nadamw_bnb_8bit doesn’t play well with FSDP offload"
-  },
-  {
-    "objectID": "src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html",
-    "href": "src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html",
-    "title": "Axolotl",
-    "section": "",
-    "text": "Acknowledgements\nPortions of this Cut Cross Entropy Software may utilize the following copyrighted material, the use of which is hereby acknowledged.\n\nPyTorch\nFrom PyTorch:\n\nCopyright (c) 2016-     Facebook, Inc            (Adam Paszke)\nCopyright (c) 2014-     Facebook, Inc            (Soumith Chintala)\nCopyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)\nCopyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)\nCopyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)\nCopyright (c) 2011-2013 NYU                      (Clement Farabet)\nCopyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)\nCopyright (c) 2006      Idiap Research Institute (Samy Bengio)\nCopyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)\n\nFrom Caffe2:\n\nCopyright (c) 2016-present, Facebook Inc. All rights reserved.\n\nAll contributions by Facebook:\nCopyright (c) 2016 Facebook Inc.\n\nAll contributions by Google:\nCopyright (c) 2015 Google Inc.\nAll rights reserved.\n\nAll contributions by Yangqing Jia:\nCopyright (c) 2015 Yangqing Jia\nAll rights reserved.\n\nAll contributions by Kakao Brain:\nCopyright 2019-2020 Kakao Brain\n\nAll contributions by Cruise LLC:\nCopyright (c) 2022 Cruise LLC.\nAll rights reserved.\n\nAll contributions by Arm:\nCopyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates\n\nAll contributions from Caffe:\nCopyright(c) 2013, 2014, 2015, the respective contributors\nAll rights reserved.\n\nAll other contributions:\nCopyright(c) 2015, 2016 the respective contributors\nAll rights reserved.\n\nCaffe2 uses a copyright model similar to Caffe: each contributor holds\ncopyright over their contributions to Caffe2. The project versioning records\nall such contribution and copyright details. 
If a contributor wants to further\nmark their specific copyright on a particular contribution, they should\nindicate their copyright solely in the commit message of the change when it is\ncommitted.\n\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions are met:\n\n1. Redistributions of source code must retain the above copyright\nnotice, this list of conditions and the following disclaimer.\n\n2. Redistributions in binary form must reproduce the above copyright\nnotice, this list of conditions and the following disclaimer in the\ndocumentation and/or other materials provided with the distribution.\n\n3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America\nand IDIAP Research Institute nor the names of its contributors may be\nused to endorse or promote products derived from this software without\nspecific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\nARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE\nLIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\nCONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\nSUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\nINTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\nCONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\nARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\nPOSSIBILITY OF SUCH DAMAGE.\nTriton\n/*\n* Copyright 2018-2020 Philippe Tillet\n* Copyright 2020-2022 OpenAI\n*\n* Permission is hereby granted, free of charge, to any person obtaining\n* a copy of this software and associated documentation files\n* (the \"Software\"), to deal in the Software without restriction,\n* including without limitation the rights to use, copy, modify, merge,\n* publish, distribute, sublicense, and/or sell copies of the Software,\n* and to permit persons to whom the Software is furnished to do so,\n* subject to the following conditions:\n*\n* The above copyright notice and this permission notice shall be\n* included in all copies or substantial portions of the Software.\n*\n* THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,\n* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n*/\nTransformers\nCopyright 2018- The Hugging Face team. All rights reserved.\n\n                                Apache License\n                        Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\nTERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n1. 
Definitions.\n\n    \"License\" shall mean the terms and conditions for use, reproduction,\n    and distribution as defined by Sections 1 through 9 of this document.\n\n    \"Licensor\" shall mean the copyright owner or entity authorized by\n    the copyright owner that is granting the License.\n\n    \"Legal Entity\" shall mean the union of the acting entity and all\n    other entities that control, are controlled by, or are under common\n    control with that entity. For the purposes of this definition,\n    \"control\" means (i) the power, direct or indirect, to cause the\n    direction or management of such entity, whether by contract or\n    otherwise, or (ii) ownership of fifty percent (50%) or more of the\n    outstanding shares, or (iii) beneficial ownership of such entity.\n\n    \"You\" (or \"Your\") shall mean an individual or Legal Entity\n    exercising permissions granted by this License.\n\n    \"Source\" form shall mean the preferred form for making modifications,\n    including but not limited to software source code, documentation\n    source, and configuration files.\n\n    \"Object\" form shall mean any form resulting from mechanical\n    transformation or translation of a Source form, including but\n    not limited to compiled object code, generated documentation,\n    and conversions to other media types.\n\n    \"Work\" shall mean the work of authorship, whether in Source or\n    Object form, made available under the License, as indicated by a\n    copyright notice that is included in or attached to the work\n    (an example is provided in the Appendix below).\n\n    \"Derivative Works\" shall mean any work, whether in Source or Object\n    form, that is based on (or derived from) the Work and for which the\n    editorial revisions, annotations, elaborations, or other modifications\n    represent, as a whole, an original work of authorship. 
For the purposes\n    of this License, Derivative Works shall not include works that remain\n    separable from, or merely link (or bind by name) to the interfaces of,\n    the Work and Derivative Works thereof.\n\n    \"Contribution\" shall mean any work of authorship, including\n    the original version of the Work and any modifications or additions\n    to that Work or Derivative Works thereof, that is intentionally\n    submitted to Licensor for inclusion in the Work by the copyright owner\n    or by an individual or Legal Entity authorized to submit on behalf of\n    the copyright owner. For the purposes of this definition, \"submitted\"\n    means any form of electronic, verbal, or written communication sent\n    to the Licensor or its representatives, including but not limited to\n    communication on electronic mailing lists, source code control systems,\n    and issue tracking systems that are managed by, or on behalf of, the\n    Licensor for the purpose of discussing and improving the Work, but\n    excluding communication that is conspicuously marked or otherwise\n    designated in writing by the copyright owner as \"Not a Contribution.\"\n\n    \"Contributor\" shall mean Licensor and any individual or Legal Entity\n    on behalf of whom a Contribution has been received by Licensor and\n    subsequently incorporated within the Work.\n\n2. Grant of Copyright License. Subject to the terms and conditions of\n    this License, each Contributor hereby grants to You a perpetual,\n    worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n    copyright license to reproduce, prepare Derivative Works of,\n    publicly display, publicly perform, sublicense, and distribute the\n    Work and such Derivative Works in Source or Object form.\n\n3. Grant of Patent License. 
Subject to the terms and conditions of\n    this License, each Contributor hereby grants to You a perpetual,\n    worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n    (except as stated in this section) patent license to make, have made,\n    use, offer to sell, sell, import, and otherwise transfer the Work,\n    where such license applies only to those patent claims licensable\n    by such Contributor that are necessarily infringed by their\n    Contribution(s) alone or by combination of their Contribution(s)\n    with the Work to which such Contribution(s) was submitted. If You\n    institute patent litigation against any entity (including a\n    cross-claim or counterclaim in a lawsuit) alleging that the Work\n    or a Contribution incorporated within the Work constitutes direct\n    or contributory patent infringement, then any patent licenses\n    granted to You under this License for that Work shall terminate\n    as of the date such litigation is filed.\n\n4. Redistribution. 
You may reproduce and distribute copies of the\n    Work or Derivative Works thereof in any medium, with or without\n    modifications, and in Source or Object form, provided that You\n    meet the following conditions:\n\n    (a) You must give any other recipients of the Work or\n        Derivative Works a copy of this License; and\n\n    (b) You must cause any modified files to carry prominent notices\n        stating that You changed the files; and\n\n    (c) You must retain, in the Source form of any Derivative Works\n        that You distribute, all copyright, patent, trademark, and\n        attribution notices from the Source form of the Work,\n        excluding those notices that do not pertain to any part of\n        the Derivative Works; and\n\n    (d) If the Work includes a \"NOTICE\" text file as part of its\n        distribution, then any Derivative Works that You distribute must\n        include a readable copy of the attribution notices contained\n        within such NOTICE file, excluding those notices that do not\n        pertain to any part of the Derivative Works, in at least one\n        of the following places: within a NOTICE text file distributed\n        as part of the Derivative Works; within the Source form or\n        documentation, if provided along with the Derivative Works; or,\n        within a display generated by the Derivative Works, if and\n        wherever such third-party notices normally appear. The contents\n        of the NOTICE file are for informational purposes only and\n        do not modify the License. 
You may add Your own attribution\n        notices within Derivative Works that You distribute, alongside\n        or as an addendum to the NOTICE text from the Work, provided\n        that such additional attribution notices cannot be construed\n        as modifying the License.\n\n    You may add Your own copyright statement to Your modifications and\n    may provide additional or different license terms and conditions\n    for use, reproduction, or distribution of Your modifications, or\n    for any such Derivative Works as a whole, provided Your use,\n    reproduction, and distribution of the Work otherwise complies with\n    the conditions stated in this License.\n\n5. Submission of Contributions. Unless You explicitly state otherwise,\n    any Contribution intentionally submitted for inclusion in the Work\n    by You to the Licensor shall be under the terms and conditions of\n    this License, without any additional terms or conditions.\n    Notwithstanding the above, nothing herein shall supersede or modify\n    the terms of any separate license agreement you may have executed\n    with Licensor regarding such Contributions.\n\n6. Trademarks. This License does not grant permission to use the trade\n    names, trademarks, service marks, or product names of the Licensor,\n    except as required for reasonable and customary use in describing the\n    origin of the Work and reproducing the content of the NOTICE file.\n\n7. Disclaimer of Warranty. Unless required by applicable law or\n    agreed to in writing, Licensor provides the Work (and each\n    Contributor provides its Contributions) on an \"AS IS\" BASIS,\n    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n    implied, including, without limitation, any warranties or conditions\n    of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n    PARTICULAR PURPOSE. 
You are solely responsible for determining the\n    appropriateness of using or redistributing the Work and assume any\n    risks associated with Your exercise of permissions under this License.\n\n8. Limitation of Liability. In no event and under no legal theory,\n    whether in tort (including negligence), contract, or otherwise,\n    unless required by applicable law (such as deliberate and grossly\n    negligent acts) or agreed to in writing, shall any Contributor be\n    liable to You for damages, including any direct, indirect, special,\n    incidental, or consequential damages of any character arising as a\n    result of this License or out of the use or inability to use the\n    Work (including but not limited to damages for loss of goodwill,\n    work stoppage, computer failure or malfunction, or any and all\n    other commercial damages or losses), even if such Contributor\n    has been advised of the possibility of such damages.\n\n9. Accepting Warranty or Additional Liability. While redistributing\n    the Work or Derivative Works thereof, You may choose to offer,\n    and charge a fee for, acceptance of support, warranty, indemnity,\n    or other liability obligations and/or rights consistent with this\n    License. However, in accepting such obligations, You may act only\n    on Your own behalf and on Your sole responsibility, not on behalf\n    of any other Contributor, and only if You agree to indemnify,\n    defend, and hold each Contributor harmless for any liability\n    incurred by, or claims asserted against, such Contributor by reason\n    of your accepting any such warranty or additional liability.\n\nEND OF TERMS AND CONDITIONS\n\nAPPENDIX: How to apply the Apache License to your work.\n\n    To apply the Apache License to your work, attach the following\n    boilerplate notice, with the fields enclosed by brackets \"[]\"\n    replaced with your own identifying information. (Don't include\n    the brackets!)  
The text should be enclosed in the appropriate\n    comment syntax for the file format. We also recommend that a\n    file or class name and description of purpose be included on the\n    same \"printed page\" as the copyright notice for easier\n    identification within third-party archives.\n\nCopyright [yyyy] [name of copyright owner]\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\n    http://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License."
-  },
-  {
-    "objectID": "src/axolotl/integrations/LICENSE.html",
-    "href": "src/axolotl/integrations/LICENSE.html",
-    "title": "Axolotl",
-    "section": "",
-    "text": "AXOLOTL COMMUNITY LICENSE AGREEMENT\nThis Axolotl Community License Agreement (“Agreement”) is entered into by and between Axolotl AI Corp. (“Axolotl”) and any individual or entity (“Licensee”) who wishes to use the Software (as defined below) in accordance with the terms and conditions set forth in this Agreement.\n\nDefinitions 1.1 “Licensee” refers to any individual or entity who has obtained a copy of the Software under this Agreement. 1.2 “Plugin Integration” means independent integration software modules which may or may not be offered by Axolotl, which may be licensed separately by their respective authors and/or licensors. 1.3 “Software” refers to the specific sub-directory of the Axolotl, Inc. software located at https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations and its subdirectories which permits Plugin Integrations to integrate with the Axolotl service.\nGrant of License 2.1 Axolotl hereby grants Licensee a worldwide, non-exclusive, royalty-free, license to use, copy, modify, merge, publish, distribute, sublicense, and/or otherwise exploit the Software, subject to the following conditions: - Licensee must comply with all the terms and conditions of this Agreement. - Licensee must include the original copyright notice and disclaimer of warranty in all copies or substantial portions of the Software. 2.2 Licensee may use the Software for any lawful purpose, except as restricted in Section 3.\nRestrictions 3.1 Licensee shall not use the Software for any activity that constitutes a commercial activity of offering for free or for sale any services, platform, or equivalent to third parties for the purposes of allowing such third parties to fine-tune artificial intelligence models. 3.2 Licensee shall not: - Use the Software for any illegal or unauthorized purpose. - Reverse engineer, decompile, or disassemble the Software. 
- Remove or modify any copyright, trademark, or other proprietary notices contained in the Software. - Use the Software in a way that could damage, disable, overburden, or impair the functionality of the Software or interfere with any third-party use of the Software. 3.3 Axolotl reserves the right to restrict certain Plugin Integrations for use with the Software. To the extent Licensee integrates a permitted, applicable Plugin Integration with the Software, Licensee shall comply with any additional terms and conditions imposed by the licensors of such Plugin Integration for use of such Plugin Integrations. Licensee shall contact Axolotl if it has questions about whether its use of the Software falls beyond the scope of this Agreement.\nIntellectual Property Rights 4.1 Axolotl and its contributors retain all intellectual property rights in and to the Software. Licensee acknowledges that this Agreement does not transfer any ownership rights or intellectual property rights to Licensee.\nDisclaimer of Warranty 5.1 THE SOFTWARE IS PROVIDED “AS IS,” WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\nTermination 6.1 Axolotl may terminate this Agreement at any time if Licensee fails to comply with any of the terms and conditions set forth herein. 
Upon termination, Licensee shall cease all use of the Software and destroy any copies in its possession.\nGoverning Law 7.1 This Agreement shall be governed by and construed in accordance with the laws of the State of California, without regards to conflicts of laws provisions thereof.\nEntire Agreement 8.1 This Agreement constitutes the entire agreement between Axolotl and Licensee with respect to the subject matter hereof and supersedes all prior or contemporaneous understandings or agreements between the parties concerning the Software, whether written or oral. Axolotl may update the terms of this Agreement from time to time, and Licensee’s continued use of the Software after any such updates shall constitute acceptance of updated terms on a go-forward basis. Axolotl will use commercially reasonable efforts to provide Licensee notice of any material updates. By using the Software, Licensee acknowledges that it has read, understood, and agrees to be bound by the terms and conditions of this Agreement.\n\nThis Agreement was last updated on August 23, 2024."
-  },
   {
     "objectID": "index.html",
     "href": "index.html",
@@ -416,36 +328,179 @@
     ]
   },
   {
-    "objectID": "docs/mac.html",
-    "href": "docs/mac.html",
-    "title": "Mac M-series",
+    "objectID": "src/axolotl/integrations/LICENSE.html",
+    "href": "src/axolotl/integrations/LICENSE.html",
+    "title": "Axolotl",
     "section": "",
-    "text": "Currently Axolotl on Mac is partially usable, many of the dependencies of Axolotl including Pytorch do not support MPS or have incomplete support.\nCurrent support:\n\nSupport for all models\nFull training of models\nLoRA training\nSample packing\nFP16 and BF16 (awaiting AMP support for MPS in Pytorch)\nTri-dao’s flash-attn (until it is supported use spd_attention as an alternative)\nxformers\nbitsandbytes (meaning no 4/8 bits loading and bnb optimizers)\nqlora\nDeepSpeed\n\nUntested: - FSDP",
+    "text": "AXOLOTL COMMUNITY LICENSE AGREEMENT\nThis Axolotl Community License Agreement (“Agreement”) is entered into by and between Axolotl AI Corp. (“Axolotl”) and any individual or entity (“Licensee”) who wishes to use the Software (as defined below) in accordance with the terms and conditions set forth in this Agreement.\n\nDefinitions 1.1 “Licensee” refers to any individual or entity who has obtained a copy of the Software under this Agreement. 1.2 “Plugin Integration” means independent integration software modules which may or may not be offered by Axolotl, which may be licensed separately by their respective authors and/or licensors. 1.3 “Software” refers to the specific sub-directory of the Axolotl, Inc. software located at https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations and its subdirectories which permits Plugin Integrations to integrate with the Axolotl service.\nGrant of License 2.1 Axolotl hereby grants Licensee a worldwide, non-exclusive, royalty-free, license to use, copy, modify, merge, publish, distribute, sublicense, and/or otherwise exploit the Software, subject to the following conditions: - Licensee must comply with all the terms and conditions of this Agreement. - Licensee must include the original copyright notice and disclaimer of warranty in all copies or substantial portions of the Software. 2.2 Licensee may use the Software for any lawful purpose, except as restricted in Section 3.\nRestrictions 3.1 Licensee shall not use the Software for any activity that constitutes a commercial activity of offering for free or for sale any services, platform, or equivalent to third parties for the purposes of allowing such third parties to fine-tune artificial intelligence models. 3.2 Licensee shall not: - Use the Software for any illegal or unauthorized purpose. - Reverse engineer, decompile, or disassemble the Software. 
- Remove or modify any copyright, trademark, or other proprietary notices contained in the Software. - Use the Software in a way that could damage, disable, overburden, or impair the functionality of the Software or interfere with any third-party use of the Software. 3.3 Axolotl reserves the right to restrict certain Plugin Integrations for use with the Software. To the extent Licensee integrates a permitted, applicable Plugin Integration with the Software, Licensee shall comply with any additional terms and conditions imposed by the licensors of such Plugin Integration for use of such Plugin Integrations. Licensee shall contact Axolotl if it has questions about whether its use of the Software falls beyond the scope of this Agreement.\nIntellectual Property Rights 4.1 Axolotl and its contributors retain all intellectual property rights in and to the Software. Licensee acknowledges that this Agreement does not transfer any ownership rights or intellectual property rights to Licensee.\nDisclaimer of Warranty 5.1 THE SOFTWARE IS PROVIDED “AS IS,” WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\nTermination 6.1 Axolotl may terminate this Agreement at any time if Licensee fails to comply with any of the terms and conditions set forth herein. 
Upon termination, Licensee shall cease all use of the Software and destroy any copies in its possession.\nGoverning Law 7.1 This Agreement shall be governed by and construed in accordance with the laws of the State of California, without regards to conflicts of laws provisions thereof.\nEntire Agreement 8.1 This Agreement constitutes the entire agreement between Axolotl and Licensee with respect to the subject matter hereof and supersedes all prior or contemporaneous understandings or agreements between the parties concerning the Software, whether written or oral. Axolotl may update the terms of this Agreement from time to time, and Licensee’s continued use of the Software after any such updates shall constitute acceptance of updated terms on a go-forward basis. Axolotl will use commercially reasonable efforts to provide Licensee notice of any material updates. By using the Software, Licensee acknowledges that it has read, understood, and agrees to be bound by the terms and conditions of this Agreement.\n\nThis Agreement was last updated on August 23, 2024."
+  },
+  {
+    "objectID": "src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html",
+    "href": "src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html",
+    "title": "Axolotl",
+    "section": "",
+    "text": "Acknowledgements\nPortions of this Cut Cross Entropy Software may utilize the following copyrighted material, the use of which is hereby acknowledged.\n\nPyTorch\nFrom PyTorch:\n\nCopyright (c) 2016-     Facebook, Inc            (Adam Paszke)\nCopyright (c) 2014-     Facebook, Inc            (Soumith Chintala)\nCopyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)\nCopyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)\nCopyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)\nCopyright (c) 2011-2013 NYU                      (Clement Farabet)\nCopyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)\nCopyright (c) 2006      Idiap Research Institute (Samy Bengio)\nCopyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)\n\nFrom Caffe2:\n\nCopyright (c) 2016-present, Facebook Inc. All rights reserved.\n\nAll contributions by Facebook:\nCopyright (c) 2016 Facebook Inc.\n\nAll contributions by Google:\nCopyright (c) 2015 Google Inc.\nAll rights reserved.\n\nAll contributions by Yangqing Jia:\nCopyright (c) 2015 Yangqing Jia\nAll rights reserved.\n\nAll contributions by Kakao Brain:\nCopyright 2019-2020 Kakao Brain\n\nAll contributions by Cruise LLC:\nCopyright (c) 2022 Cruise LLC.\nAll rights reserved.\n\nAll contributions by Arm:\nCopyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates\n\nAll contributions from Caffe:\nCopyright(c) 2013, 2014, 2015, the respective contributors\nAll rights reserved.\n\nAll other contributions:\nCopyright(c) 2015, 2016 the respective contributors\nAll rights reserved.\n\nCaffe2 uses a copyright model similar to Caffe: each contributor holds\ncopyright over their contributions to Caffe2. The project versioning records\nall such contribution and copyright details. 
If a contributor wants to further\nmark their specific copyright on a particular contribution, they should\nindicate their copyright solely in the commit message of the change when it is\ncommitted.\n\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions are met:\n\n1. Redistributions of source code must retain the above copyright\nnotice, this list of conditions and the following disclaimer.\n\n2. Redistributions in binary form must reproduce the above copyright\nnotice, this list of conditions and the following disclaimer in the\ndocumentation and/or other materials provided with the distribution.\n\n3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America\nand IDIAP Research Institute nor the names of its contributors may be\nused to endorse or promote products derived from this software without\nspecific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\nARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE\nLIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\nCONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\nSUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\nINTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\nCONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\nARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\nPOSSIBILITY OF SUCH DAMAGE.\nTriton\n/*\n* Copyright 2018-2020 Philippe Tillet\n* Copyright 2020-2022 OpenAI\n*\n* Permission is hereby granted, free of charge, to any person obtaining\n* a copy of this software and associated documentation files\n* (the \"Software\"), to deal in the Software without restriction,\n* including without limitation the rights to use, copy, modify, merge,\n* publish, distribute, sublicense, and/or sell copies of the Software,\n* and to permit persons to whom the Software is furnished to do so,\n* subject to the following conditions:\n*\n* The above copyright notice and this permission notice shall be\n* included in all copies or substantial portions of the Software.\n*\n* THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,\n* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n*/\nTransformers\nCopyright 2018- The Hugging Face team. All rights reserved.\n\n                                Apache License\n                        Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\nTERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n1. 
Definitions.\n\n    \"License\" shall mean the terms and conditions for use, reproduction,\n    and distribution as defined by Sections 1 through 9 of this document.\n\n    \"Licensor\" shall mean the copyright owner or entity authorized by\n    the copyright owner that is granting the License.\n\n    \"Legal Entity\" shall mean the union of the acting entity and all\n    other entities that control, are controlled by, or are under common\n    control with that entity. For the purposes of this definition,\n    \"control\" means (i) the power, direct or indirect, to cause the\n    direction or management of such entity, whether by contract or\n    otherwise, or (ii) ownership of fifty percent (50%) or more of the\n    outstanding shares, or (iii) beneficial ownership of such entity.\n\n    \"You\" (or \"Your\") shall mean an individual or Legal Entity\n    exercising permissions granted by this License.\n\n    \"Source\" form shall mean the preferred form for making modifications,\n    including but not limited to software source code, documentation\n    source, and configuration files.\n\n    \"Object\" form shall mean any form resulting from mechanical\n    transformation or translation of a Source form, including but\n    not limited to compiled object code, generated documentation,\n    and conversions to other media types.\n\n    \"Work\" shall mean the work of authorship, whether in Source or\n    Object form, made available under the License, as indicated by a\n    copyright notice that is included in or attached to the work\n    (an example is provided in the Appendix below).\n\n    \"Derivative Works\" shall mean any work, whether in Source or Object\n    form, that is based on (or derived from) the Work and for which the\n    editorial revisions, annotations, elaborations, or other modifications\n    represent, as a whole, an original work of authorship. 
For the purposes\n    of this License, Derivative Works shall not include works that remain\n    separable from, or merely link (or bind by name) to the interfaces of,\n    the Work and Derivative Works thereof.\n\n    \"Contribution\" shall mean any work of authorship, including\n    the original version of the Work and any modifications or additions\n    to that Work or Derivative Works thereof, that is intentionally\n    submitted to Licensor for inclusion in the Work by the copyright owner\n    or by an individual or Legal Entity authorized to submit on behalf of\n    the copyright owner. For the purposes of this definition, \"submitted\"\n    means any form of electronic, verbal, or written communication sent\n    to the Licensor or its representatives, including but not limited to\n    communication on electronic mailing lists, source code control systems,\n    and issue tracking systems that are managed by, or on behalf of, the\n    Licensor for the purpose of discussing and improving the Work, but\n    excluding communication that is conspicuously marked or otherwise\n    designated in writing by the copyright owner as \"Not a Contribution.\"\n\n    \"Contributor\" shall mean Licensor and any individual or Legal Entity\n    on behalf of whom a Contribution has been received by Licensor and\n    subsequently incorporated within the Work.\n\n2. Grant of Copyright License. Subject to the terms and conditions of\n    this License, each Contributor hereby grants to You a perpetual,\n    worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n    copyright license to reproduce, prepare Derivative Works of,\n    publicly display, publicly perform, sublicense, and distribute the\n    Work and such Derivative Works in Source or Object form.\n\n3. Grant of Patent License. 
Subject to the terms and conditions of\n    this License, each Contributor hereby grants to You a perpetual,\n    worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n    (except as stated in this section) patent license to make, have made,\n    use, offer to sell, sell, import, and otherwise transfer the Work,\n    where such license applies only to those patent claims licensable\n    by such Contributor that are necessarily infringed by their\n    Contribution(s) alone or by combination of their Contribution(s)\n    with the Work to which such Contribution(s) was submitted. If You\n    institute patent litigation against any entity (including a\n    cross-claim or counterclaim in a lawsuit) alleging that the Work\n    or a Contribution incorporated within the Work constitutes direct\n    or contributory patent infringement, then any patent licenses\n    granted to You under this License for that Work shall terminate\n    as of the date such litigation is filed.\n\n4. Redistribution. 
You may reproduce and distribute copies of the\n    Work or Derivative Works thereof in any medium, with or without\n    modifications, and in Source or Object form, provided that You\n    meet the following conditions:\n\n    (a) You must give any other recipients of the Work or\n        Derivative Works a copy of this License; and\n\n    (b) You must cause any modified files to carry prominent notices\n        stating that You changed the files; and\n\n    (c) You must retain, in the Source form of any Derivative Works\n        that You distribute, all copyright, patent, trademark, and\n        attribution notices from the Source form of the Work,\n        excluding those notices that do not pertain to any part of\n        the Derivative Works; and\n\n    (d) If the Work includes a \"NOTICE\" text file as part of its\n        distribution, then any Derivative Works that You distribute must\n        include a readable copy of the attribution notices contained\n        within such NOTICE file, excluding those notices that do not\n        pertain to any part of the Derivative Works, in at least one\n        of the following places: within a NOTICE text file distributed\n        as part of the Derivative Works; within the Source form or\n        documentation, if provided along with the Derivative Works; or,\n        within a display generated by the Derivative Works, if and\n        wherever such third-party notices normally appear. The contents\n        of the NOTICE file are for informational purposes only and\n        do not modify the License. 
You may add Your own attribution\n        notices within Derivative Works that You distribute, alongside\n        or as an addendum to the NOTICE text from the Work, provided\n        that such additional attribution notices cannot be construed\n        as modifying the License.\n\n    You may add Your own copyright statement to Your modifications and\n    may provide additional or different license terms and conditions\n    for use, reproduction, or distribution of Your modifications, or\n    for any such Derivative Works as a whole, provided Your use,\n    reproduction, and distribution of the Work otherwise complies with\n    the conditions stated in this License.\n\n5. Submission of Contributions. Unless You explicitly state otherwise,\n    any Contribution intentionally submitted for inclusion in the Work\n    by You to the Licensor shall be under the terms and conditions of\n    this License, without any additional terms or conditions.\n    Notwithstanding the above, nothing herein shall supersede or modify\n    the terms of any separate license agreement you may have executed\n    with Licensor regarding such Contributions.\n\n6. Trademarks. This License does not grant permission to use the trade\n    names, trademarks, service marks, or product names of the Licensor,\n    except as required for reasonable and customary use in describing the\n    origin of the Work and reproducing the content of the NOTICE file.\n\n7. Disclaimer of Warranty. Unless required by applicable law or\n    agreed to in writing, Licensor provides the Work (and each\n    Contributor provides its Contributions) on an \"AS IS\" BASIS,\n    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n    implied, including, without limitation, any warranties or conditions\n    of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n    PARTICULAR PURPOSE. 
You are solely responsible for determining the\n    appropriateness of using or redistributing the Work and assume any\n    risks associated with Your exercise of permissions under this License.\n\n8. Limitation of Liability. In no event and under no legal theory,\n    whether in tort (including negligence), contract, or otherwise,\n    unless required by applicable law (such as deliberate and grossly\n    negligent acts) or agreed to in writing, shall any Contributor be\n    liable to You for damages, including any direct, indirect, special,\n    incidental, or consequential damages of any character arising as a\n    result of this License or out of the use or inability to use the\n    Work (including but not limited to damages for loss of goodwill,\n    work stoppage, computer failure or malfunction, or any and all\n    other commercial damages or losses), even if such Contributor\n    has been advised of the possibility of such damages.\n\n9. Accepting Warranty or Additional Liability. While redistributing\n    the Work or Derivative Works thereof, You may choose to offer,\n    and charge a fee for, acceptance of support, warranty, indemnity,\n    or other liability obligations and/or rights consistent with this\n    License. However, in accepting such obligations, You may act only\n    on Your own behalf and on Your sole responsibility, not on behalf\n    of any other Contributor, and only if You agree to indemnify,\n    defend, and hold each Contributor harmless for any liability\n    incurred by, or claims asserted against, such Contributor by reason\n    of your accepting any such warranty or additional liability.\n\nEND OF TERMS AND CONDITIONS\n\nAPPENDIX: How to apply the Apache License to your work.\n\n    To apply the Apache License to your work, attach the following\n    boilerplate notice, with the fields enclosed by brackets \"[]\"\n    replaced with your own identifying information. (Don't include\n    the brackets!)  
The text should be enclosed in the appropriate\n    comment syntax for the file format. We also recommend that a\n    file or class name and description of purpose be included on the\n    same \"printed page\" as the copyright notice for easier\n    identification within third-party archives.\n\nCopyright [yyyy] [name of copyright owner]\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\n    http://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License."
+  },
+  {
+    "objectID": "TODO.html",
+    "href": "TODO.html",
+    "title": "todo list",
+    "section": "",
+    "text": "[] Validation of parameters for combinations that won’t work\n\n\n\n\nFSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203\nadamw_bnb_8bit doesn’t play well with FSDP offload"
+  },
+  {
+    "objectID": "TODO.html#things-that-are-known-not-to-work",
+    "href": "TODO.html#things-that-are-known-not-to-work",
+    "title": "todo list",
+    "section": "",
+    "text": "FSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203\nadamw_bnb_8bit doesn’t play well with FSDP offload"
+  },
+  {
+    "objectID": "examples/colab-notebooks/colab-axolotl-example.html",
+    "href": "examples/colab-notebooks/colab-axolotl-example.html",
+    "title": "Setting up",
+    "section": "",
+    "text": "import torch\n# Check so there is a gpu available, a T4(free tier) is enough to run this notebook\nassert (torch.cuda.is_available()==True)\n!pip install --no-build-isolation axolotl[deepspeed]"
+  },
+  {
+    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#hugging-face-login-optional",
+    "href": "examples/colab-notebooks/colab-axolotl-example.html#hugging-face-login-optional",
+    "title": "Setting up",
+    "section": "Hugging Face login (optional)",
+    "text": "Hugging Face login (optional)\n\nfrom huggingface_hub import notebook_login\nnotebook_login()"
+  },
+  {
+    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#example-configuration",
+    "href": "examples/colab-notebooks/colab-axolotl-example.html#example-configuration",
+    "title": "Setting up",
+    "section": "Example configuration",
+    "text": "Example configuration\n\nimport yaml\n\nyaml_string = \"\"\"\nbase_model: NousResearch/Meta-Llama-3.1-8B\n\nload_in_8bit: false\nload_in_4bit: true\nstrict: false\n\ndatasets:\n  - path: tatsu-lab/alpaca\n    type: alpaca\ndataset_prepared_path: last_run_prepared\nval_set_size: 0.05\noutput_dir: ./outputs/lora-out\n\nsequence_len: 2048\nsample_packing: true\neval_sample_packing: true\npad_to_sequence_len: true\n\nadapter: qlora\nlora_model_dir:\nlora_r: 32\nlora_alpha: 16\nlora_dropout: 0.05\nlora_target_linear: true\nlora_fan_in_fan_out:\nlora_modules_to_save:\n  - embed_tokens\n  - lm_head\n\nwandb_project:\nwandb_entity:\nwandb_watch:\nwandb_name:\nwandb_log_model:\n\ngradient_accumulation_steps: 2\nmicro_batch_size: 1\nnum_epochs: 1\noptimizer: paged_adamw_8bit\nlr_scheduler: cosine\nlearning_rate: 2e-5\n\ntrain_on_inputs: false\ngroup_by_length: false\nbf16: auto\nfp16:\ntf32: false\n\ngradient_checkpointing: true\nearly_stopping_patience:\nresume_from_checkpoint:\nlogging_steps: 1\nxformers_attention:\nflash_attention: false\nsdp_attention: true\n\nwarmup_steps: 1\nmax_steps: 25\nevals_per_epoch: 1\neval_table_size:\nsaves_per_epoch: 1\ndebug:\ndeepspeed:\nweight_decay: 0.0\nfsdp:\nfsdp_config:\nspecial_tokens:\n  pad_token: &lt;|end_of_text|&gt;\n\"\"\"\n\n\n# Convert the YAML string to a Python dictionary\nyaml_dict = yaml.safe_load(yaml_string)\n\n# Specify your file path\nfile_path = 'test_axolotl.yaml'\n\n# Write the YAML file\nwith open(file_path, 'w') as file:\n    yaml.dump(yaml_dict, file)\n\nAbove we have a configuration file with base LLM model and datasets specified, among many other things. Axolotl can automatically detect whether the specified datasets are on HuggingFace repo or local machine.\nThe Axolotl configuration options encompass model and dataset selection, data pre-processing, and training. 
Let’s go through them line by line:\n\n“base model”: String value, specifies the underlying pre-trained LLM that will be used for finetuning\n\nNext we have options for model weights quantization. Quantization allows for reduction in occupied memory on GPUs.\n\n“load_in_8bit”: Boolean value, whether to quantize the model weights into 8-bit integer.\n“load_in_4bit”: Boolean value, whether to quantize the model weights into 4-bit integer.\n“strict”: Boolean value. If false, it allows for overriding established configuration options in the yaml file when executing in command-line interface.\n“datasets”: a list of dicts that contain path and type of data sets as well as other optional configurations where datasets are concerned. Supports multiple datasets.\n“val_set_size”: Either a float value less than one or an integer less than the total size of dataset. Sets the size of validation set from the whole dataset. If float, sets the proportion of the dataset assigned for validation. If integer, sets the direct size of validation set.\n“output_dir”: String value. Path of trained model.\n\nFor data preprocessing:\n\n“sequence_len”: Integer. Specifies the maximum sequence length of the input. Typically 2048 or less.\n“pad_to_sequence_len”: Boolean. Padding input to maximum sequence length.\n“sample_packing”: Boolean. Specifies whether to use multi-packing with block diagonal attention.\n“special_tokens”: Python dict, optional. Allows users to specify the additional special tokens to be ignored by the tokenizer.\n\nFor LoRA configuration and its hyperparamters:\n\n“adapter”: String. Either “lora” or “qlora”, depending on user’s choice.\n“lora_model_dir”: String, Optional. Path to directory that contains LoRA model, if there is already a trained LoRA model the user would like to use.\n“lora_r”: Integer. Refers to the rank of LoRA decomposition matrices. Higher value will reduce LoRA efficiency. Recommended to be set to 8.\n“lora_alpha”: Integer. 
Scale the weight matrices by \\(\\frac{\\text{lora_alpha}}{\\text{lora_r}}\\)Recommended to be fixed at 16.\n“lora_dropout”: Float that is 1 or less. The dropout probability of a lora layer.\n“lora_target_linear”: Boolean. If true, lora will target all linear modules in the transformers architecture.\n“lora_modules_to_save”: If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.\n\nSee LoRA for detailed explanation of LoRA implementation.\nFor the training configurations:\n\n“gradient_accumulation_steps”: Integer. The number of steps over which to accumulate gradient for batch training. E.g. if 2, backprop is performed every two steps.\n“micro_batch_size”: Integer. Batch size per gpu / gradient_accumulation_steps\n“num_epochs”: Integer. Number of epochs. One epoch is when training has looped over every batch in the whole data set once.\n“optimizer”: The optimizer to use for the training.\n“learning_rate”: The learning rate.\n“lr_scheduler”: The learning rate scheduler to use for adjusting learning rate during training.\n“train_on_inputs”: Boolean. Whether to ignore or include the user’s prompt from the training labels.\n“group_by_length”: Boolean. Whether to group similarly sized data to minimize padding.\n“bf16”: Either “auto”, “true”, or “false”. Whether to use CUDA bf16 floating point format. If set to “auto”, will automatically apply bf16 should the gpu supports it.\n“fp16”: Optional. Specifies whether to use CUDA fp16. Automatically set to true if “bf16” is set to true. Otherwise false.\n“tf32”: Boolean. Whether to use CUDA tf32. Will override bf16.\n“gradient_checkpointing”: Boolean. Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\n“gradient_checkpointing_kwargs”: Python Dict. Fed into the trainer.\n“logging_steps”: Integer. Log training information over every specified number of steps.\n“flash_attention”: Boolean. 
Whether to use the flash attention mechanism.\n“sdp_attention”: Boolean. Whether to use the Scaled Dot Product attention mechanism (the attention mechanism in the original implementation of transformers.)\n“warmup_steps”: Integer. The number of pre-training steps where a very low learning rate is used.\n“evals_per_epoch”: Integer. Number of evaluations to be performed within one training epoch.\n“saves_per_epoch”: Integer. Number of times the model is saved in one training epoch.\n“weight_decay”: Positive Float. Sets the “strength” of weight decay (i.e. setting the coefficient of L2 regularization)\n\nThe above is but a snippet aiming to get users familiarized with the types of streamlined configuration options axolotl provides. For a full list of configuration options, see here\nTrain the model\n\n!accelerate launch -m axolotl.cli.train /content/test_axolotl.yaml\n\nPredict with trained model\n\n!accelerate launch -m axolotl.cli.inference /content/test_axolotl.yaml \\\n    --lora_model_dir=\"./outputs/lora-out\" --gradio"
+  },
+  {
+    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#deeper-dive",
+    "href": "examples/colab-notebooks/colab-axolotl-example.html#deeper-dive",
+    "title": "Setting up",
+    "section": "Deeper Dive",
+    "text": "Deeper Dive\nIt is also helpful to gain some familiarity over some of the core inner workings of axolotl"
+  },
+  {
+    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#configuration-normalization",
+    "href": "examples/colab-notebooks/colab-axolotl-example.html#configuration-normalization",
+    "title": "Setting up",
+    "section": "Configuration Normalization",
+    "text": "Configuration Normalization\nAxolotl uses a custom Dict class, called DictDefault to store configurations specified in the yaml configuration file (into a Python variable named cfg). The definition for this custom Dict can be found in the utils/dict.py\nDictDefault is amended such that calling a missing key from it will result in a None return type. This is important because if some configuration options aren’t specified by the user, the None type allows Axolotl to perform boolean operations to determine the default settings for missing configurations. For more examples on how this is done, check out utils/config/init.py"
+  },
+  {
+    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#loading-models-tokenizers-and-trainer",
+    "href": "examples/colab-notebooks/colab-axolotl-example.html#loading-models-tokenizers-and-trainer",
+    "title": "Setting up",
+    "section": "Loading Models, Tokenizers, and Trainer",
+    "text": "Loading Models, Tokenizers, and Trainer\nIf we inspect cli.train.py, we will find that most of the heavy lifting were done by the function train() which is itself imported from src/axolotl/train.py.\ntrain() takes care of loading the appropriate tokenizer and pre-trained model through load_model() and load_tokenizer() from src/axolotl/utils/models.py respectively.\nload_tokenizer() loads in the appropriate tokenizer given the desired model, as well as chat templates.\nModelLoader class follows after tokenizer has been selected. It will automatically discern the base model type, load in the desired model, as well as applying model-appropriate attention mechanism modifications (e.g. flash attention). Depending on which base model the user chooses in the configuration, ModelLoader will utilize the corresponding “attention hijacking” script. For example, if the user specified the base model to be NousResearch/Meta-Llama-3.1-8B, which is of llama type, and set flash_attn to True, ModelLoader will load in llama_attn_hijack_flash.py. For a list of supported attention hijacking, please refer to the directory /src/axolotl/monkeypatch/\nAnother important operation encompassed in train() is setting up the training that takes into account of user-specified traning configurations (e.g. num_epochs, optimizer) through the use of setup_trainer() from /src/axolotl/utils/trainer.py, which in turn relies on modules from /src/axolotl/core/trainer_builder.py. trainer_builder.py provides a list of trainer object options bespoke for the task type (Causal or Reinforcement learning (‘dpo’, ‘ipo’, ‘kto’) )"
+  },
+  {
+    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#monkey-patch",
+    "href": "examples/colab-notebooks/colab-axolotl-example.html#monkey-patch",
+    "title": "Setting up",
+    "section": "Monkey patch",
+    "text": "Monkey patch\nThe Monkey patch directory is where model architecture/optimization patching scripts are stored (these are modifications that are not implemented in the official releases, hence the name monkey patch). It includes attention hijacking, ReLoRA, and unsloth optimization."
+  },
+  {
+    "objectID": "docs/unsloth.html",
+    "href": "docs/unsloth.html",
+    "title": "Unsloth",
+    "section": "",
+    "text": "Overview\nUnsloth provides hand-written optimized kernels for LLM finetuning that slightly improve speed and VRAM over standard industry baselines.\n\n\nInstallation\nThe following will install the correct unsloth and extras from source.\npython scripts/unsloth_install.py | sh\n\n\nUsing unsloth w Axolotl\nAxolotl exposes a few configuration options to try out unsloth and get most of the performance gains.\nOur unsloth integration is currently limited to the following model architectures: - llama\nThese options are specific to LoRA finetuning and cannot be used for multi-GPU finetuning\nunsloth_lora_mlp: true\nunsloth_lora_qkv: true\nunsloth_lora_o: true\nThese options are composable and can be used with multi-gpu finetuning\nunsloth_cross_entropy_loss: true\nunsloth_rms_norm: true\nunsloth_rope: true\n\n\nLimitations\n\nSingle GPU only; e.g. no multi-gpu support\nNo deepspeed or FSDP support (requires multi-gpu)\nLoRA + QLoRA support only. No full fine tunes or fp8 support.\nLimited model architecture support. Llama, Phi, Gemma, Mistral only\nNo MoE support.",
     "crumbs": [
       "How-To Guides",
-      "Mac M-series"
+      "Unsloth"
     ]
   },
   {
-    "objectID": "docs/multimodal.html",
-    "href": "docs/multimodal.html",
-    "title": "MultiModal / Vision Language Models (BETA)",
+    "objectID": "docs/multi-node.html",
+    "href": "docs/multi-node.html",
+    "title": "Multi Node",
     "section": "",
-    "text": "MultiModal / Vision Language Models (BETA)\n\nSupported Models\n\nMllama, i.e. llama with vision models\n\n\n\nUsage\nCurrently multimodal support is limited and doesn’t have full feature parity. To finetune a multimodal Llama w/ LoRA, you’ll need to use the following in YAML in combination with the rest of the required hyperparams.\nbase_model: alpindale/Llama-3.2-11B-Vision-Instruct\nprocessor_type: AutoProcessor\nskip_prepare_dataset: true\n\nchat_template: llama3_2_vision\ndatasets:\n  - path: HuggingFaceH4/llava-instruct-mix-vsft\n    type: chat_template\n    split: train[:1%]\n    field_messages: messages\nremove_unused_columns: false\nsample_packing: false\n\n# only finetune the Language model, leave the vision model and vision tower frozen\nlora_target_modules: 'language_model.model.layers.[\\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'"
+    "text": "You will need to create a configuration for accelerate, either by using accelerate config and following the instructions or you can use one of the presets below:\n~/.cache/huggingface/accelerate/default_config.yaml\nConfigure your model to use FSDP with for example:",
+    "crumbs": [
+      "How-To Guides",
+      "Multi Node"
+    ]
   },
   {
-    "objectID": "docs/batch_vs_grad.html",
-    "href": "docs/batch_vs_grad.html",
-    "title": "Batch size vs Gradient accumulation",
-    "section": "",
-    "text": "Gradient accumulation means accumulating gradients over several mini-batches and updating the model weights afterward. When the samples in each batch are diverse, this technique doesn’t significantly impact learning.\nThis method allows for effective training with larger effective batch sizes without needing proportionally larger memory. Here’s why:\n\nMemory Consumption with Batch Size: The primary reason increasing the batch size impacts memory is due to the storage requirements for intermediate activations. When you forward propagate a batch through a network, you have to store the activations at each layer for each sample in the batch, because these activations are used during backpropagation to compute gradients. Therefore, larger batches mean more activations, leading to greater GPU memory consumption.\nGradient Accumulation: With gradient accumulation, you’re effectively simulating a larger batch size by accumulating gradients over several smaller batches (or micro-batches). However, at any given time, you’re only forward and backward propagating a micro-batch. This means you only store activations for the micro-batch, not the full accumulated batch. 
As a result, you can simulate the effect of a larger batch size without the memory cost of storing activations for a large batch.\n\nExample 1: Micro batch size: 3 Gradient accumulation steps: 2 Number of GPUs: 3 Total batch size = 3 * 2 * 3 = 18\n| GPU 1          | GPU 2          | GPU 3          |\n|----------------|----------------|----------------|\n| S1, S2, S3     | S4, S5, S6     | S7, S8, S9     |\n| e1, e2, e3     | e4, e5, e6     | e7, e8, e9     |\n|----------------|----------------|----------------|\n| → (accumulate) | → (accumulate) | → (accumulate) |\n|----------------|----------------|----------------|\n| S10, S11, S12  | S13, S14, S15  | S16, S17, S18  |\n| e10, e11, e12  | e13, e14, e15  | e16, e17, e18  |\n|----------------|----------------|----------------|\n| → (apply)      | → (apply)      | → (apply)      |\n\nAccumulated gradient for the weight w1 after the second iteration (considering all GPUs):\nTotal gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6 + e7 + e8 + e9 + e10 + e11 + e12 + e13 + e14 + e15 + e16 + e17 + e18\n\nWeight update for w1:\nw1_new = w1_old - learning rate x (Total gradient for w1 / 18)\nExample 2: Micro batch size: 2 Gradient accumulation steps: 1 Number of GPUs: 3 Total batch size = 2 * 1 * 3 = 6\n| GPU 1     | GPU 2     | GPU 3     |\n|-----------|-----------|-----------|\n| S1, S2    | S3, S4    | S5, S6    |\n| e1, e2    | e3, e4    | e5, e6    |\n|-----------|-----------|-----------|\n| → (apply) | → (apply) | → (apply) |\n\nAccumulated gradient for the weight w1 (considering all GPUs):\nTotal gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6\n\nWeight update for w1:\nw1_new = w1_old - learning rate × (Total gradient for w1 / 6)"
+    "objectID": "docs/multi-node.html#machine-configuration",
+    "href": "docs/multi-node.html#machine-configuration",
+    "title": "Multi Node",
+    "section": "Machine configuration",
+    "text": "Machine configuration\nOn each machine you need a copy of Axolotl; we suggest using the same commit to ensure compatibility.\nYou will also need to have the same configuration file for your model on each machine.\nOn the main machine only, make sure the port you set as main_process_port is open in TCP and reachable by other machines.\nAll you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.",
+    "crumbs": [
+      "How-To Guides",
+      "Multi Node"
+    ]
   },
   {
-    "objectID": "docs/dataset_preprocessing.html",
-    "href": "docs/dataset_preprocessing.html",
-    "title": "Dataset Preprocessing",
+    "objectID": "docs/faq.html",
+    "href": "docs/faq.html",
+    "title": "FAQ",
     "section": "",
-    "text": "Dataset pre-processing is the step where Axolotl takes each dataset you’ve configured alongside the (dataset format)[../dataset-formats/] and prompt strategies to: - parse the dataset based on the dataset format - transform the dataset to how you would interact with the model based on the prompt strategy - tokenize the dataset based on the configured model & tokenizer - shuffle and merge multiple datasets together if using more than one\nThe processing of the datasets can happen one of two ways:\n\nBefore kicking off training by calling python -m axolotl.cli.preprocess /path/to/your.yaml --debug\nWhen training is started\n\nWhat are the benefits of pre-processing? When training interactively or for sweeps (e.g. you are restarting the trainer often), processing the datasets can oftentimes be frustratingly slow. Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent training parameters so that it will intelligently pull from its cache when possible.\nThe path of the cache is controlled by dataset_prepared_path: and is often left blank in example YAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data.\nIf dataset_prepared_path: is left empty, when training, the processed dataset will be cached in a default path of ./last_run_prepared/, but will ignore anything already cached there. By explicitly setting dataset_prepared_path: ./last_run_prepared, the trainer will use whatever pre-processed data is in the cache.\nWhat are the edge cases? Let’s say you are writing a custom prompt strategy or using a user-defined prompt template. Because the trainer cannot readily detect these changes, we cannot change the calculated hash value for the pre-processed dataset. If you have dataset_prepared_path: ... set and change your prompt templating logic, it may not pick up the changes you made and you will be training over the old prompt."
+    "text": "Q: The trainer stopped and hasn’t progressed in several minutes.\n\nA: Usually an issue with the GPUs communicating with each other. See the NCCL doc\n\nQ: Exitcode -9\n\nA: This usually happens when you run out of system RAM.\n\nQ: Exitcode -7 while using deepspeed\n\nA: Try upgrading deepspeed w: pip install -U deepspeed\n\nQ: AttributeError: ‘DummyOptim’ object has no attribute ‘step’\n\nA: You may be using deepspeed with single gpu. Please don’t set deepspeed: in yaml or cli.",
+    "crumbs": [
+      "FAQ"
+    ]
+  },
+  {
+    "objectID": "docs/ray-integration.html",
+    "href": "docs/ray-integration.html",
+    "title": "Ray Train integration",
+    "section": "",
+    "text": "Axolotl supports using Ray as an alternative to accelerate for orchestrating training. This is especially useful for multi-node training since you only have to set up code and dependencies on a single node and launch training as if you were using a single node.\nWith the --use-ray CLI flag, Axolotl will use Ray Train’s TorchTrainer to run training.",
+    "crumbs": [
+      "How-To Guides",
+      "Ray Train integration"
+    ]
+  },
+  {
+    "objectID": "docs/ray-integration.html#ray-cluster-setup",
+    "href": "docs/ray-integration.html#ray-cluster-setup",
+    "title": "Ray Train integration",
+    "section": "Ray cluster setup",
+    "text": "Ray cluster setup\nA prerequisite for using the Ray Train integration is to set up a Ray cluster on your desired node(s). For a detailed guide on how you can get started with ray clusters, check the official Ray docs here: https://docs.ray.io/en/latest/cluster/getting-started.html\nEvery Ray cluster has one head node and a set of worker nodes. The head node is just like any other worker node, but it also runs certain special processes related to scheduling and orchestration. Ray-enabled scripts are run on the head node and depending on the resources (number of CPUs, GPUs, etc) they request, will be scheduled to run certain tasks on the worker nodes. For more on key concepts behind a Ray cluster, you can refer to this doc.",
+    "crumbs": [
+      "How-To Guides",
+      "Ray Train integration"
+    ]
+  },
+  {
+    "objectID": "docs/ray-integration.html#sanity-check",
+    "href": "docs/ray-integration.html#sanity-check",
+    "title": "Ray Train integration",
+    "section": "Sanity check",
+    "text": "Sanity check\nTo run a sanity check on whether your ray cluster is set up properly, execute the following on the head node:\nray status\nThe output should have a summary of your Ray cluster - list of all the nodes in your cluster, the number of CPUs and GPUs in your cluster, etc. For example, if you have a cluster with 1 CPU-only head node and 2 4xL40S worker nodes, the output can look like this:\nNode status\n---------------------------------------------------------------\nActive:\n 1 head\nIdle:\n 2 4xL40S:48CPU-384GB\nPending:\n (no pending nodes)\nRecent failures:\n (no failures)\n\nResources\n---------------------------------------------------------------\nUsage:\n 0.0/96.0 CPU\n 0.0/8.0 GPU\n 0B/800.00GiB memory\n 0B/229.57GiB object_store_memory\n\nDemands:\n (no resource demands)\nYou should also be able to see the same on the Ray dashboard.",
+    "crumbs": [
+      "How-To Guides",
+      "Ray Train integration"
+    ]
+  },
+  {
+    "objectID": "docs/ray-integration.html#configuring-training-with-ray-train",
+    "href": "docs/ray-integration.html#configuring-training-with-ray-train",
+    "title": "Ray Train integration",
+    "section": "Configuring training with Ray Train",
+    "text": "Configuring training with Ray Train\nYou can find an example configuration at examples/llama-3/lora-1b-ray.yml.\nThe key parameters to note here are:\n...\nuse_ray: true\nray_num_workers: 4\n# optional\nresources_per_worker:\n    GPU: 1\n...\n\nuse_ray: This is the flag that enables the Ray Train integration. You can either use the corresponding --use-ray flag in the CLI or set use_ray in the config file.\nray_num_workers: This is the number of workers/GPUs to use for training.\nresources_per_worker: This is the Ray resource request for each worker. This can be used to request a specific GPU type or a custom resource for each worker. For example, if your ray cluster has GPUs of different types, and you only want to use NVIDIA L40S GPUs, you can do\n\nresources_per_worker:\n    accelerator_type:L40S: 0.001",
+    "crumbs": [
+      "How-To Guides",
+      "Ray Train integration"
+    ]
+  },
+  {
+    "objectID": "docs/ray-integration.html#launching-training",
+    "href": "docs/ray-integration.html#launching-training",
+    "title": "Ray Train integration",
+    "section": "Launching training",
+    "text": "Launching training\nYou can simply run the following command on the head node:\naxolotl train examples/llama-3/lora-1b-ray.yml --use-ray\nThis will launch training on the head node and workers will be scheduled automatically by Ray Train to run on the appropriate head or worker nodes.\nYou can also monitor training progress on the Ray dashboard.\nComing back to the example on a Ray cluster with 1 head node and 2 4xL40S worker nodes, let’s say you want to make use of all 8 GPUs. You would be able to just set ray_num_workers: 8 and run the previous command. The Cluster tab will show the following:\n\n\n\nRay dashboard",
+    "crumbs": [
+      "How-To Guides",
+      "Ray Train integration"
+    ]
   },
   {
     "objectID": "docs/fsdp_qlora.html",
diff --git a/sitemap.xml b/sitemap.xml
index f9065e73d..8926366cb 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -2,126 +2,130 @@
 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/FAQS.html</loc>
-    <lastmod>2025-01-29T05:08:44.746Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.204Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/stepwise_supervised.html</loc>
-    <lastmod>2025-01-29T05:08:44.747Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.206Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/pretraining.html</loc>
-    <lastmod>2025-01-29T05:08:44.747Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.206Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/tokenized.html</loc>
-    <lastmod>2025-01-29T05:08:44.747Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.206Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/torchao.html</loc>
-    <lastmod>2025-01-29T05:08:44.749Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.210Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/reward_modelling.html</loc>
-    <lastmod>2025-01-29T05:08:44.749Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.210Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/input_output.html</loc>
-    <lastmod>2025-01-29T05:08:44.749Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.209Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/config.html</loc>
-    <lastmod>2025-01-29T05:08:44.747Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.206Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/lr_groups.html</loc>
-    <lastmod>2025-01-29T05:08:44.749Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.209Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/debugging.html</loc>
-    <lastmod>2025-01-29T05:08:44.747Z</lastmod>
-  </url>
-  <url>
-    <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/faq.html</loc>
-    <lastmod>2025-01-29T05:08:44.747Z</lastmod>
-  </url>
-  <url>
-    <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/multi-node.html</loc>
-    <lastmod>2025-01-29T05:08:44.749Z</lastmod>
-  </url>
-  <url>
-    <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/unsloth.html</loc>
-    <lastmod>2025-01-29T05:08:44.749Z</lastmod>
-  </url>
-  <url>
-    <loc>https://axolotl-ai-cloud.github.io/axolotl/examples/colab-notebooks/colab-axolotl-example.html</loc>
-    <lastmod>2025-01-29T05:08:44.750Z</lastmod>
-  </url>
-  <url>
-    <loc>https://axolotl-ai-cloud.github.io/axolotl/TODO.html</loc>
-    <lastmod>2025-01-29T05:08:44.746Z</lastmod>
-  </url>
-  <url>
-    <loc>https://axolotl-ai-cloud.github.io/axolotl/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html</loc>
-    <lastmod>2025-01-29T05:08:44.763Z</lastmod>
-  </url>
-  <url>
-    <loc>https://axolotl-ai-cloud.github.io/axolotl/src/axolotl/integrations/LICENSE.html</loc>
-    <lastmod>2025-01-29T05:08:44.763Z</lastmod>
-  </url>
-  <url>
-    <loc>https://axolotl-ai-cloud.github.io/axolotl/index.html</loc>
-    <lastmod>2025-01-29T05:08:44.760Z</lastmod>
-  </url>
-  <url>
-    <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/mac.html</loc>
-    <lastmod>2025-01-29T05:08:44.749Z</lastmod>
-  </url>
-  <url>
-    <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/multimodal.html</loc>
-    <lastmod>2025-01-29T05:08:44.749Z</lastmod>
-  </url>
-  <url>
-    <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/batch_vs_grad.html</loc>
-    <lastmod>2025-01-29T05:08:44.747Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.206Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/dataset_preprocessing.html</loc>
-    <lastmod>2025-01-29T05:08:44.747Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.206Z</lastmod>
+  </url>
+  <url>
+    <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/batch_vs_grad.html</loc>
+    <lastmod>2025-01-29T05:10:30.206Z</lastmod>
+  </url>
+  <url>
+    <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/multimodal.html</loc>
+    <lastmod>2025-01-29T05:10:30.209Z</lastmod>
+  </url>
+  <url>
+    <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/mac.html</loc>
+    <lastmod>2025-01-29T05:10:30.209Z</lastmod>
+  </url>
+  <url>
+    <loc>https://axolotl-ai-cloud.github.io/axolotl/index.html</loc>
+    <lastmod>2025-01-29T05:10:30.223Z</lastmod>
+  </url>
+  <url>
+    <loc>https://axolotl-ai-cloud.github.io/axolotl/src/axolotl/integrations/LICENSE.html</loc>
+    <lastmod>2025-01-29T05:10:30.226Z</lastmod>
+  </url>
+  <url>
+    <loc>https://axolotl-ai-cloud.github.io/axolotl/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html</loc>
+    <lastmod>2025-01-29T05:10:30.226Z</lastmod>
+  </url>
+  <url>
+    <loc>https://axolotl-ai-cloud.github.io/axolotl/TODO.html</loc>
+    <lastmod>2025-01-29T05:10:30.204Z</lastmod>
+  </url>
+  <url>
+    <loc>https://axolotl-ai-cloud.github.io/axolotl/examples/colab-notebooks/colab-axolotl-example.html</loc>
+    <lastmod>2025-01-29T05:10:30.210Z</lastmod>
+  </url>
+  <url>
+    <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/unsloth.html</loc>
+    <lastmod>2025-01-29T05:10:30.210Z</lastmod>
+  </url>
+  <url>
+    <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/multi-node.html</loc>
+    <lastmod>2025-01-29T05:10:30.209Z</lastmod>
+  </url>
+  <url>
+    <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/faq.html</loc>
+    <lastmod>2025-01-29T05:10:30.206Z</lastmod>
+  </url>
+  <url>
+    <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/ray-integration.html</loc>
+    <lastmod>2025-01-29T05:10:30.210Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/fsdp_qlora.html</loc>
-    <lastmod>2025-01-29T05:08:44.748Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.206Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/rlhf.html</loc>
-    <lastmod>2025-01-29T05:08:44.749Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.210Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/multipack.html</loc>
-    <lastmod>2025-01-29T05:08:44.749Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.209Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/nccl.html</loc>
-    <lastmod>2025-01-29T05:08:44.749Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.210Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/amd_hpc.html</loc>
-    <lastmod>2025-01-29T05:08:44.747Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.205Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/template_free.html</loc>
-    <lastmod>2025-01-29T05:08:44.747Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.206Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/inst_tune.html</loc>
-    <lastmod>2025-01-29T05:08:44.747Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.206Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/index.html</loc>
-    <lastmod>2025-01-29T05:08:44.747Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.206Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/conversation.html</loc>
-    <lastmod>2025-01-29T05:08:44.747Z</lastmod>
+    <lastmod>2025-01-29T05:10:30.206Z</lastmod>
   </url>
 </urlset>
diff --git a/src/axolotl/integrations/LICENSE.html b/src/axolotl/integrations/LICENSE.html
index e22db7249..453ecde07 100644
--- a/src/axolotl/integrations/LICENSE.html
+++ b/src/axolotl/integrations/LICENSE.html
@@ -181,6 +181,12 @@ ul.task-list li input[type="checkbox"] {
   <a href="../../../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>
diff --git a/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html b/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html
index b3e97505e..3ac82371c 100644
--- a/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html
+++ b/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html
@@ -181,6 +181,12 @@ ul.task-list li input[type="checkbox"] {
   <a href="../../../../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
   </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../../../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train integration</span></a>
+  </div>
 </li>
       </ul>
   </li>