diff --git a/docs/nd_parallelism.html b/docs/nd_parallelism.html
new file mode 100644
index 000000000..98c3adadf
--- /dev/null
+++ b/docs/nd_parallelism.html
@@ -0,0 +1,1107 @@

N-D Parallelism


Axolotl enables training models at scale by composing different parallelism techniques. This is essential when:

  • A model’s weights are too large to fit on a single GPU’s memory.
  • A model’s activations, especially with very long contexts, are too large for a single GPU.
  • +
  • You want to accelerate training by using multiple GPUs or nodes.
  • +
+

or combinations of the above!

+
+

Core Concepts

+

Parallelism strategies can be combined. The key is understanding how each one divides the workload. PyTorch’s DeviceMesh is the modern way to manage these combinations, creating a logical grid of your GPUs and assigning different parallel strategies to different dimensions of the grid.
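For example (a sketch only: the ordering of the mesh dimensions is an internal detail, but the product of all the parallelism sizes must equal the total number of GPUs), a 16-GPU job could be laid out as a logical 2 x 2 x 2 x 2 grid using the config fields introduced below:

# Hypothetical 16-GPU layout: 2 * 2 * 2 * 2 = 16
dp_replicate_size: 2      # DDP-style replicas
dp_shard_size: 2          # FSDP sharding within each replica
context_parallel_size: 2  # sequence split within each shard group
tensor_parallel_size: 2   # per-layer split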

+
+

Data Parallelism

+

Data Parallelism focuses on splitting the global data batch across GPUs.

+
  • Distributed Data Parallel (DDP): The classic approach. The full model is replicated on every GPU, and each GPU processes a different slice of the data batch. Gradients are averaged across all GPUs after the backward pass to keep the replicas synchronized. This can substantially improve data throughput compared to single-device training, but requires that each GPU can hold the entire model, its gradients, and its optimizer states.

  • Fully Sharded Data Parallel (FSDP): A highly memory-efficient form of data parallelism (inspired by DeepSpeed’s ZeRO). Instead of replicating the model, FSDP shards the model’s parameters, gradients, and optimizer states across the GPUs in the data-parallel group. During computation, each GPU receives the parameters it needs via an all_gather operation just before they are used, and can discard them immediately afterward (reshard-after-forward).

    • FSDP maps to the ZeRO stages:
      • ZeRO-2 (reshard_after_forward=False): Shards gradients and optimizer states. Model weights are replicated on each GPU.
      • ZeRO-3 (reshard_after_forward=True): Shards gradients, optimizer states, AND model parameters. This provides the most memory savings at the cost of more communication (re-gathering parameters for both the forward and backward passes).
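In config terms, this distinction is the reshard_after_forward flag in fsdp_config (a minimal sketch using the FSDP2 field names from the multi-GPU docs, not a complete config):

fsdp_version: 2
fsdp_config:
  # true  -> ZeRO-3-style: parameters are re-gathered before each forward/backward
  #          use and freed again afterward (most memory savings, most communication)
  # false -> ZeRO-2-style: full parameters stay resident on every GPU
  reshard_after_forward: true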
+
+

[Experimental] Tensor Parallelism (TP)

+

Also known as “horizontal model parallelism,” as described in the Megatron-LM paper. Instead of splitting the batch, TP splits the model’s layers themselves across GPUs.

+
  • How it works: For a linear layer Y = XA, the weight matrix A is split column-wise (A = [A_1, A_2]). The computation becomes Y_1 = XA_1 and Y_2 = XA_2, which can happen in parallel on different GPUs. The final output Y is simply the concatenation of Y_1 and Y_2. Check this comment for more detailed info.
  • Requirement: TP involves frequent, small communications within a forward/backward pass. It requires a very fast interconnect between GPUs (e.g., NVLink) and is typically not recommended across different nodes.
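To make the split concrete, here it is written out as block matrix multiplication (just a restatement of the Megatron-LM scheme above): GPU 0 holds A_1 and computes Y_1, GPU 1 holds A_2 and computes Y_2, and concatenating the two halves recovers Y exactly, with no extra arithmetic.

Y = XA = X[A_1, A_2] = [XA_1, XA_2] = [Y_1, Y_2]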
+
+

Context Parallelism (CP)

+

Context Parallelism, also called Sequence Parallelism, addresses the memory bottleneck from long sequences. The input sequence itself is split along the sequence length dimension and distributed across GPUs.

+
  • How it works: If you have a sequence of 8192 tokens and a context_parallel_size of 4, each GPU will only handle a chunk of 2048 tokens.
  • The Challenge: Attention is not local; every token needs to “attend to” every other token. Splitting the sequence breaks this.
  • The Solution (ring-flash-attention): An efficient communication protocol is used. To compute attention for its local sequence chunk, each GPU passes its Key-Value (KV) cache to its neighbor in a “ring.” After N-1 steps, every GPU has seen the KV-cache from all other GPUs, allowing it to compute the correct attention values for its chunk. This is implemented using the highly optimized flash-attention kernel at each step.
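As a sketch of the example above (sequence_len is Axolotl’s usual maximum-sequence-length field; pairing it with CP like this is illustrative):

sequence_len: 8192
context_parallel_size: 4  # each GPU attends over a local 2048-token chunk
flash_attention: true     # assumed here, since CP is built on ring-flash-attention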
+
+

Hybrid Sharding Data Parallel (HSDP)

+

HSDP is a 2D strategy that intelligently combines FSDP and DDP, typically for multi-node training.

+
  • Intra-Node (within a machine): Use FSDP. This is efficient because GPUs on the same node have fast interconnects (NVLink), making the all_gather operations for sharded parameters fast.
  • Inter-Node (across machines): Use DDP. Gradient synchronization between nodes is less frequent than FSDP’s parameter gathering, making it a better fit for the slower node-to-node network (e.g., Ethernet/InfiniBand).
  • Example: With 2 nodes of 8 GPUs each (16 total), you could have dp_shard_size=8 (FSDP within each node) and dp_replicate_size=2 (DDP across the two nodes), as in the sketch below.
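That example in config form (dp_shard_size and dp_replicate_size are the fields described in the Usage section below):

# 2 nodes x 8 GPUs = 16 GPUs total
dp_shard_size: 8      # FSDP inside each node (fast NVLink domain)
dp_replicate_size: 2  # DDP across the two nodes (slower network)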
+
+
+

Usage

+
# FSDP config. See https://docs.axolotl.ai/docs/multi-gpu.html#sec-fsdp
+fsdp_version: 2
+fsdp_config:
+  # ...
+
+# The number of GPUs to shard the model parameters across (FSDP dimension).
+dp_shard_size: 4
+
+# The number of times to replicate the sharded model (DDP dimension).
+dp_replicate_size: 2
+
+# Number of GPUs for Tensor Parallelism.
+tensor_parallel_size: 1  # (default is 1, no TP)
+
+# Number of GPUs for Context/Sequence Parallelism.
+context_parallel_size: 1 # (default is 1, no CP)
+

Note: We recommend FSDP for composing these settings. If you use DeepSpeed instead, tensor_parallel_size is the only parallelism setting above that is compatible with it.

+
+
+

Examples

+
  1. HSDP on 2 nodes with 4 GPUs each (8 GPUs total):
     • You want FSDP within each node and DDP across nodes.
     • Set dp_shard_size: 4 and dp_replicate_size: 2.
  2. FSDP + TP on a single 8-GPU node:
     • You want to split the model across 4 GPUs using FSDP, and further split each layer across 2 GPUs with TP.
     • Set dp_shard_size: 4 and tensor_parallel_size: 2 (see the sketch after this list).
  3. FSDP + CP on a single 8-GPU node for long context:
     • You want to shard the model across all 8 GPUs and also split the sequence length across all 8 GPUs.
     • Set dp_shard_size: 8 and context_parallel_size: 8. Note: this means the data parallel group and context parallel group are the same. A more common setup might be to shard across a smaller group.
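For instance, example 2 as a config fragment (only the parallelism fields are shown; the fsdp_config body is elided as in the Usage section):

fsdp_version: 2
fsdp_config:
  # ...
dp_shard_size: 4         # FSDP group of 4
tensor_parallel_size: 2  # TP pairs within each shard group; 4 * 2 = 8 GPUs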
+
+

Support Matrix

+

This matrix describes how different parallelism methods can be combined in Axolotl.

| Combination | dp_replicate_size | dp_shard_size | tp_size | cp_size | Status & Notes |
|---|---|---|---|---|---|
| FSDP (ZeRO-3) | 1 | >1 | 1 | 1 | ✅ Fully supported. Shards model across all GPUs. |
| HSDP | >1 | >1 | 1 | 1 | ✅ Fully supported. FSDP intra-node, DDP inter-node. |
| FSDP + TP | 1 | >1 | >1 | 1 | ✅ 2D Parallelism. Shards the model across a dp_shard group, and TP-splits layers within the tp group. |
| HSDP + TP | >1 | >1 | >1 | 1 | ✅ 3D Parallelism. A powerful but complex combination. |
| FSDP + CP | 1 | >1 | 1 | >1 | ✅ 2D Parallelism. Combines FSDP with context parallelism. |
| FSDP + TP + CP | 1 | >1 | >1 | >1 | ✅ 3D Parallelism. Another advanced combination. |
| DDP + TP/CP | >1 | 1 | >1 | >1 | ❌ Not Supported. The ParallelismConfig explicitly prevents this, as composing pure DDP with TP/CP without FSDP is inefficient and complex. You should use FSDP instead (dp_shard_size > 1). |
| Just TP / CP | 1 | 1 | >1 | >1 | ✅ Supported. Useful for inference or when the model fits on one GPU but context is too long. |
+
  • tp_size refers to tensor_parallel_size
  • cp_size refers to context_parallel_size
+ + +
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/qat.html b/docs/qat.html index 3a6390476..cb95dfe06 100644 --- a/docs/qat.html +++ b/docs/qat.html @@ -391,9 +391,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Mixed Precision Training - + diff --git a/docs/quantize.html b/docs/quantize.html index de29ec3e0..67e228551 100644 --- a/docs/quantize.html +++ b/docs/quantize.html @@ -391,9 +391,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Mixed Precision Training - + diff --git a/docs/ray-integration.html b/docs/ray-integration.html index 7ca5d8f1b..9d4d7d12e 100644 --- a/docs/ray-integration.html +++ b/docs/ray-integration.html @@ -392,9 +392,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Mixed Precision Training - + diff --git a/docs/reward_modelling.html b/docs/reward_modelling.html index 6b34885f6..3e4a72387 100644 --- a/docs/reward_modelling.html +++ b/docs/reward_modelling.html @@ -392,9 +392,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Mixed Precision Training - + diff --git a/docs/rlhf.html b/docs/rlhf.html index 93a55df7d..1b18e0e99 100644 --- a/docs/rlhf.html +++ b/docs/rlhf.html @@ -392,9 +392,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Mixed Precision Training - + diff --git a/docs/sequence_parallelism.html b/docs/sequence_parallelism.html index 750b07adf..be2acd06f 100644 --- a/docs/sequence_parallelism.html +++ b/docs/sequence_parallelism.html @@ -392,9 +392,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Mixed Precision Training - + diff --git a/docs/torchao.html b/docs/torchao.html index 7cc59eb9b..769608dce 100644 --- a/docs/torchao.html +++ b/docs/torchao.html @@ -392,9 +392,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Mixed Precision Training - + diff --git a/docs/unsloth.html b/docs/unsloth.html index 1140b319e..4c2beeb5e 100644 --- a/docs/unsloth.html +++ b/docs/unsloth.html @@ -392,9 +392,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Mixed Precision Training - + diff --git a/examples/colab-notebooks/colab-axolotl-example.html b/examples/colab-notebooks/colab-axolotl-example.html index f51f72662..a783fb65b 100644 --- a/examples/colab-notebooks/colab-axolotl-example.html +++ b/examples/colab-notebooks/colab-axolotl-example.html @@ -395,9 +395,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Mixed Precision Training - + diff --git a/index.html b/index.html index 424de4cca..7ba88971f 100644 --- a/index.html +++ b/index.html @@ -391,9 +391,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Mixed Precision Training - + diff --git a/search.json b/search.json index 022bb2a7e..eec472f44 100644 --- a/search.json +++ b/search.json @@ -1595,640 +1595,58 @@ ] }, { - "objectID": "docs/multi-gpu.html", - "href": "docs/multi-gpu.html", - "title": "Multi-GPU", + "objectID": "docs/nd_parallelism.html", + "href": "docs/nd_parallelism.html", + "title": "N-D Parallelism", "section": "", - "text": "This guide covers advanced training configurations for multi-GPU setups using Axolotl.", - "crumbs": [ - "Deployments", - "Multi-GPU" - ] - }, - { - "objectID": "docs/multi-gpu.html#sec-overview", - "href": "docs/multi-gpu.html#sec-overview", - "title": "Multi-GPU", - "section": "1 Overview", - "text": "1 Overview\nAxolotl supports several methods for multi-GPU training:\n\nDeepSpeed (recommended)\nFSDP (Fully Sharded Data Parallel)\nSequence parallelism\nFSDP + QLoRA", - "crumbs": [ - "Deployments", - "Multi-GPU" - ] - }, - { - "objectID": 
"docs/multi-gpu.html#sec-deepspeed", - "href": "docs/multi-gpu.html#sec-deepspeed", - "title": "Multi-GPU", - "section": "2 DeepSpeed", - "text": "2 DeepSpeed\n\n2.1 Configuration\nAdd to your YAML config:\ndeepspeed: deepspeed_configs/zero1.json\n\n\n2.2 Usage\n# Fetch deepspeed configs (if not already present)\naxolotl fetch deepspeed_configs\n\n# Passing arg via config\naxolotl train config.yml\n\n# Passing arg via cli\naxolotl train config.yml --deepspeed deepspeed_configs/zero1.json\n\n\n2.3 ZeRO Stages\nWe provide default configurations for:\n\nZeRO Stage 1 (zero1.json)\nZeRO Stage 1 with torch compile (zero1_torch_compile.json)\nZeRO Stage 2 (zero2.json)\nZeRO Stage 3 (zero3.json)\nZeRO Stage 3 with bf16 (zero3_bf16.json)\nZeRO Stage 3 with bf16 and CPU offload params(zero3_bf16_cpuoffload_params.json)\nZeRO Stage 3 with bf16 and CPU offload params and optimizer (zero3_bf16_cpuoffload_all.json)\n\n\n\n\n\n\n\nTip\n\n\n\nChoose the configuration that offloads the least amount to memory while still being able to fit on VRAM for best performance.\nStart from Stage 1 -> Stage 2 -> Stage 3.\n\n\n\n\n\n\n\n\nTip\n\n\n\nUsing ZeRO Stage 3 with Single-GPU training\nZeRO Stage 3 can be used for training on a single GPU by manually setting the environment variables:\nWORLD_SIZE=1 LOCAL_RANK=0 MASTER_ADDR=0.0.0.0 MASTER_PORT=29500", - "crumbs": [ - "Deployments", - "Multi-GPU" - ] - }, - { - "objectID": "docs/multi-gpu.html#sec-fsdp", - "href": "docs/multi-gpu.html#sec-fsdp", - "title": "Multi-GPU", - "section": "3 Fully Sharded Data Parallel (FSDP)", - "text": "3 Fully Sharded Data Parallel (FSDP)\n\n\n\n\n\n\nNote\n\n\n\nFSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.\n\n\n\n3.1 Migrating from FSDP1 to FSDP2\nTo migrate your config from FSDP1 to FSDP2, you must use the fsdp_version top-level config field to specify the FSDP version, and\nalso follow the config field mapping below to update field names.\n\n3.1.1 Config mapping\n\n\n\nFSDP1\nFSDP2\n\n\n\n\nfsdp_sharding_strategy\nreshard_after_forward\n\n\nfsdp_backward_prefetch_policy\nREMOVED\n\n\nfsdp_backward_prefetch\nREMOVED\n\n\nfsdp_forward_prefetch\nREMOVED\n\n\nfsdp_sync_module_states\nREMOVED\n\n\nfsdp_cpu_ram_efficient_loading\ncpu_ram_efficient_loading\n\n\nfsdp_state_dict_type\nstate_dict_type\n\n\nfsdp_use_orig_params\nREMOVED\n\n\n\nFor more details, please see the migration guide in the torchtitan repo. In Axolotl,\nif you were using the following FSDP1 config:\nfsdp_version: 1\nfsdp_config:\n fsdp_offload_params: false\n fsdp_cpu_ram_efficient_loading: true\n fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP\n fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer\n fsdp_state_dict_type: FULL_STATE_DICT\n fsdp_sharding_strategy: FULL_SHARD\nYou can migrate to the following FSDP2 config:\nfsdp_version: 2\nfsdp_config:\n offload_params: false\n cpu_ram_efficient_loading: true\n auto_wrap_policy: TRANSFORMER_BASED_WRAP\n transformer_layer_cls_to_wrap: Qwen3DecoderLayer\n state_dict_type: FULL_STATE_DICT\n reshard_after_forward: true\n\n\n\n3.2 FSDP1 (deprecated)\n\n\n\n\n\n\nNote\n\n\n\nUsing fsdp to configure FSDP is deprecated and will be removed in an upcoming release of Axolotl. 
Please use fsdp_config as above instead.\n\n\nfsdp:\n - full_shard\n - auto_wrap\nfsdp_config:\n fsdp_offload_params: true\n fsdp_state_dict_type: FULL_STATE_DICT\n fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer", - "crumbs": [ - "Deployments", - "Multi-GPU" - ] - }, - { - "objectID": "docs/multi-gpu.html#sec-sequence-parallelism", - "href": "docs/multi-gpu.html#sec-sequence-parallelism", - "title": "Multi-GPU", - "section": "4 Sequence parallelism", - "text": "4 Sequence parallelism\nWe support sequence parallelism (SP) via the\nring-flash-attention project. This\nallows one to split up sequences across GPUs, which is useful in the event that a\nsingle sequence causes OOM errors during model training.\nSee our dedicated guide for more information.\n\n4.1 FSDP + QLoRA\nFor combining FSDP with QLoRA, see our dedicated guide.", - "crumbs": [ - "Deployments", - "Multi-GPU" - ] - }, - { - "objectID": "docs/multi-gpu.html#sec-performance", - "href": "docs/multi-gpu.html#sec-performance", - "title": "Multi-GPU", - "section": "5 Performance Optimization", - "text": "5 Performance Optimization\n\n5.1 Liger Kernel Integration\nPlease see docs for more info.", - "crumbs": [ - "Deployments", - "Multi-GPU" - ] - }, - { - "objectID": "docs/multi-gpu.html#sec-troubleshooting", - "href": "docs/multi-gpu.html#sec-troubleshooting", - "title": "Multi-GPU", - "section": "6 Troubleshooting", - "text": "6 Troubleshooting\n\n6.1 NCCL Issues\nFor NCCL-related problems, see our NCCL troubleshooting guide.\n\n\n6.2 Common Problems\n\nMemory IssuesTraining Instability\n\n\n\nReduce micro_batch_size\nReduce eval_batch_size\nAdjust gradient_accumulation_steps\nConsider using a higher ZeRO stage\n\n\n\n\nStart with DeepSpeed ZeRO-2\nMonitor loss values\nCheck learning rates\n\n\n\n\nFor more detailed troubleshooting, see our debugging guide.", - "crumbs": [ - "Deployments", - "Multi-GPU" - ] - }, - { - "objectID": "docs/torchao.html", - "href": "docs/torchao.html", - "title": "PyTorch ao", - "section": "", - "text": "To use experimental optimizers (AdamWFp8, AdamW4bit, AdamW8bit) from Pytorch Ao, please install the package as shown below.\n\n\n\n\n\n\nTip\n\n\n\nSome experimental optimizers are already present in regular Pytorch, so please re-check if you actually need this package!\n\n\n\nInstallation\nStable Release from the PyTorch index\npip install torchao --extra-index-url https://download.pytorch.org/whl/cu121 # full options are cpu/cu118/cu121/cu124\nNightly release\npip install --pre torchao-nightly --index-url https://download.pytorch.org/whl/nightly/cu121 # full options are cpu/cu118/cu121/cu124", + "text": "Axolotl enables training models at scale by composing different parallelism techniques. This is essential when:\n\nA model’s weights are too large to fit on a single GPU’s memory.\nA model’s activations, especially with very long contexts, are too large for a single GPU.\nYou want to accelerate training by using multiple GPUs or nodes.\n\nor combinations of the above!\n\n\nParallelism strategies can be combined. The key is understanding how each one divides the workload. PyTorch’s DeviceMesh is the modern way to manage these combinations, creating a logical grid of your GPUs and assigning different parallel strategies to different dimensions of the grid.\n\n\nData Parallelism focuses on splitting the global data batch across GPUs.\n\nDistributed Data Parallel (DDP): The classic approach. The full model is replicated on every GPU. Each GPU processes a different slice of the data batch. 
Gradients are then averaged across all GPUs after the backward pass to keep the models synchronized. This can substantially improve data throughput compared to single-device training, but requires that each GPU is able to hold the entire model, its gradients, and optimizer states.\nFully Sharded Data Parallel (FSDP): A highly memory-efficient form of data parallelism (inspired by DeepSpeed’s ZeRO). Instead of replicating the model, FSDP shards the model’s parameters, gradients, and optimizer states across the GPUs in the data-parallel group. During computation, each GPU receives the specific parameters it needs via an all_gather operation just before they are used, and they can be discarded immediately after (reshard-after-forward).\n\nFSDP maps to ZeRO stages:\n\nZeRO-2 (reshard_after_forward=False): Shards gradients and optimizer states. Model weights are replicated on each GPU.\nZeRO-3 (reshard_after_forward=True): Shards gradients, optimizer states, AND model parameters. This provides the most memory savings at the cost of more communication (re-gathering parameters for both forward and backward passes).\n\n\n\n\n\n\nAlso known as “horizontal model parallelism,” as described in the Megatron-LM paper. Instead of splitting the batch, TP splits the model’s layers themselves across GPUs.\n\nHow it works: For a linear layer Y = XA, the weight matrix A is split column-wise (A = [A_1, A_2]). The computation becomes Y_1 = XA_1 and Y_2 = XA_2, which can happen in parallel on different GPUs. The final output Y is simply the concatenation of Y_1 and Y_2. Check this comment for more detailed info.\nRequirement: TP involves frequent, small communications within a forward/backward pass. It requires a very fast interconnect between GPUs (e.g., NVLink) and is typically not recommended across different nodes.\n\n\n\n\nContext Parallelism, also called Sequence Parallelism, addresses the memory bottleneck from long sequences. The input sequence itself is split along the sequence length dimension and distributed across GPUs.\n\nHow it works: If you have a sequence of 8192 tokens and a context_parallel_size of 4, each GPU will only handle a chunk of 2048 tokens.\nThe Challenge: Attention is not local; every token needs to “attend to” every other token. Splitting the sequence breaks this.\nThe Solution (ring-flash-attention): An efficient communication protocol is used. To compute attention for its local sequence chunk, each GPU passes its Key-Value (KV) cache to its neighbor in a “ring.” After N-1 steps, every GPU has seen the KV-cache from all other GPUs, allowing it to compute the correct attention values for its chunk. This is implemented using the highly optimized flash-attention kernel at each step.\n\n\n\n\nHSDP is a 2D strategy that intelligently combines FSDP and DDP, typically for multi-node training.\n\nIntra-Node (within a machine): Use FSDP. This is efficient because GPUs on the same node have fast interconnects (NVLink), making the all_gather operations for sharded parameters fast.\nInter-Node (across machines): Use DDP. The gradient synchronization between nodes is less frequent than FSDP’s parameter gathering, making it a better fit for the slower node-to-node network (e.g., Ethernet/Infiniband).\nExample: With 2 nodes of 8 GPUs each (16 total), you could have dp_shard_size=8 (FSDP within each node) and dp_replicate_size=2 (DDP across the two nodes).\n\n\n\n\n\n# FSDP config. 
See https://docs.axolotl.ai/docs/multi-gpu.html#sec-fsdp\nfsdp_version: 2\nfsdp_config:\n # ...\n\n# The number of GPUs to shard the model parameters across (FSDP dimension).\ndp_shard_size: 4\n\n# The number of times to replicate the sharded model (DDP dimension).\ndp_replicate_size: 2\n\n# Number of GPUs for Tensor Parallelism.\ntensor_parallel_size: 1 # (default is 1, no TP)\n\n# Number of GPUs for Context/Sequence Parallelism.\ncontext_parallel_size: 1 # (default is 1, no CP)\nNote: We recommend FSDP. DeepSpeed is only compatible with tensor_parallel_size.\n\n\n\n\nHSDP on 2 nodes with 4 GPUs each (8 GPUs total):\n\nYou want FSDP within each node and DDP across nodes.\nSet dp_shard_size: 4 and dp_replicate_size: 2.\n\nFSDP + TP on a single 8-GPU node:\n\nYou want to split the model across 4 GPUs using FSDP, and further split each layer across 2 GPUs with TP.\nSet dp_shard_size: 4 and tensor_parallel_size: 2.\n\nFSDP + CP on a single 8-GPU node for long context:\n\nYou want to shard the model across all 8 GPUs and also split the sequence length across all 8 GPUs.\nSet dp_shard_size: 8 and context_parallel_size: 8. Note: this means the data parallel group and context parallel group are the same. A more common setup might be to shard across a smaller group.\n\n\n\n\n\nThis matrix describes how different parallelism methods can be combined in Axolotl.\n\n\n\n\n\n\n\n\n\n\n\nCombination\ndp_replicate_size\ndp_shard_size\ntp_size\ncp_size\nStatus & Notes\n\n\n\n\nFSDP (ZeRO-3)\n1\n>1\n1\n1\n✅ Fully supported. Shards model across all GPUs.\n\n\nHSDP\n>1\n>1\n1\n1\n✅ Fully supported. FSDP intra-node, DDP inter-node.\n\n\nFSDP + TP\n1\n>1\n>1\n1\n✅ 2D Parallelism. Shards the model across a dp_shard group, and TP-splits layers within the tp group.\n\n\nHSDP + TP\n>1\n>1\n>1\n1\n✅ 3D Parallelism. A powerful but complex combination.\n\n\nFSDP + CP\n1\n>1\n1\n>1\n✅ 2D Parallelism. Combines FSDP with context parallelism.\n\n\nFSDP + TP + CP\n1\n>1\n>1\n>1\n✅ 3D Parallelism. Another advanced combination.\n\n\nDDP + TP/CP\n>1\n1\n>1\n>1\n❌ Not Supported. The ParallelismConfig explicitly prevents this, as composing pure DDP with TP/CP without FSDP is inefficient and complex. You should use FSDP instead (dp_shard_size > 1).\n\n\nJust TP / CP\n1\n1\n>1\n>1\n✅ Supported. Useful for inference or when the model fits on one GPU but context is too long.\n\n\n\n\ntp_size refers to tensor_parallel_size\ncp_size refers to context_parallel_size", "crumbs": [ "Advanced Features", - "PyTorch ao" + "N-D Parallelism" ] }, { - "objectID": "docs/cli.html", - "href": "docs/cli.html", - "title": "Command Line Interface (CLI)", + "objectID": "docs/nd_parallelism.html#core-concepts", + "href": "docs/nd_parallelism.html#core-concepts", + "title": "N-D Parallelism", "section": "", - "text": "The Axolotl CLI provides a streamlined interface for training and fine-tuning large language models. 
This guide covers\nthe CLI commands, their usage, and common examples.", - "crumbs": [ - "Getting Started", - "Command Line Interface (CLI)" - ] - }, - { - "objectID": "docs/cli.html#basic-commands", - "href": "docs/cli.html#basic-commands", - "title": "Command Line Interface (CLI)", - "section": "Basic Commands", - "text": "Basic Commands\nAll Axolotl commands follow this general structure:\naxolotl <command> [config.yml] [options]\nThe config file can be local or a URL to a raw YAML file.\n\nLauncher Arguments\nFor commands that support multi-GPU (train, evaluate, …), you can pass launcher-specific arguments using the -- separator:\n# Pass torchrun arguments\naxolotl train config.yml --launcher torchrun -- --nproc_per_node=2 --nnodes=1\n\n# Pass accelerate arguments\naxolotl train config.yml --launcher accelerate -- --config_file=accelerate_config.yml --num_processes=4\nArguments after -- are passed directly to the launcher (torchrun, accelerate launch, etc.).", - "crumbs": [ - "Getting Started", - "Command Line Interface (CLI)" - ] - }, - { - "objectID": "docs/cli.html#command-reference", - "href": "docs/cli.html#command-reference", - "title": "Command Line Interface (CLI)", - "section": "Command Reference", - "text": "Command Reference\n\nfetch\nDownloads example configurations and deepspeed configs to your local machine.\n# Get example YAML files\naxolotl fetch examples\n\n# Get deepspeed config files\naxolotl fetch deepspeed_configs\n\n# Specify custom destination\naxolotl fetch examples --dest path/to/folder\n\n\npreprocess\nPreprocesses and tokenizes your dataset before training. This is recommended for large datasets.\n# Basic preprocessing\naxolotl preprocess config.yml\n\n# Preprocessing with one GPU\nCUDA_VISIBLE_DEVICES=\"0\" axolotl preprocess config.yml\n\n# Debug mode to see processed examples\naxolotl preprocess config.yml --debug\n\n# Debug with limited examples\naxolotl preprocess config.yml --debug --debug-num-examples 5\nConfiguration options:\ndataset_prepared_path: Local folder for saving preprocessed data\npush_dataset_to_hub: HuggingFace repo to push preprocessed data (optional)\n\n\ntrain\nTrains or fine-tunes a model using the configuration specified in your YAML file.\n# Basic training\naxolotl train config.yml\n\n# Train and set/override specific options\naxolotl train config.yml \\\n --learning-rate 1e-4 \\\n --micro-batch-size 2 \\\n --num-epochs 3\n\n# Training without accelerate\naxolotl train config.yml --launcher python\n\n# Pass launcher-specific arguments using -- separator\naxolotl train config.yml --launcher torchrun -- --nproc_per_node=2 --nnodes=1\naxolotl train config.yml --launcher accelerate -- --config_file=accelerate_config.yml\n\n# Resume training from checkpoint\naxolotl train config.yml --resume-from-checkpoint path/to/checkpoint\nIt is possible to run sweeps over multiple hyperparameters by passing in a sweeps config.\n# Basic training with sweeps\naxolotl train config.yml --sweep path/to/sweep.yaml\nExample sweep config:\n_:\n # This section is for dependent variables we need to fix\n - load_in_8bit: false\n load_in_4bit: false\n adapter: lora\n - load_in_8bit: true\n load_in_4bit: false\n adapter: lora\n\n# These are independent variables\nlearning_rate: [0.0003, 0.0006]\nlora_r:\n - 16\n - 32\nlora_alpha:\n - 16\n - 32\n - 64\n\n\ninference\nRuns inference using your trained model in either CLI or Gradio interface mode.\n# CLI inference with LoRA\naxolotl inference config.yml --lora-model-dir=\"./outputs/lora-out\"\n\n# CLI inference 
with full model\naxolotl inference config.yml --base-model=\"./completed-model\"\n\n# Gradio web interface\naxolotl inference config.yml --gradio \\\n --lora-model-dir=\"./outputs/lora-out\"\n\n# Inference with input from file\ncat prompt.txt | axolotl inference config.yml \\\n --base-model=\"./completed-model\"\n\n\nmerge-lora\nMerges trained LoRA adapters into the base model.\n# Basic merge\naxolotl merge-lora config.yml\n\n# Specify LoRA directory (usually used with checkpoints)\naxolotl merge-lora config.yml --lora-model-dir=\"./lora-output/checkpoint-100\"\n\n# Merge using CPU (if out of GPU memory)\nCUDA_VISIBLE_DEVICES=\"\" axolotl merge-lora config.yml\nConfiguration options:\ngpu_memory_limit: Limit GPU memory usage\nlora_on_cpu: Load LoRA weights on CPU\n\n\nmerge-sharded-fsdp-weights\nMerges sharded FSDP model checkpoints into a single combined checkpoint.\n# Basic merge\naxolotl merge-sharded-fsdp-weights config.yml\n\n\nevaluate\nEvaluates a model’s performance (loss etc) on the train and eval datasets.\n# Basic evaluation\naxolotl evaluate config.yml\n\n# Evaluation with launcher arguments\naxolotl evaluate config.yml --launcher torchrun -- --nproc_per_node=2\n\n\nlm-eval\nRuns LM Evaluation Harness on your model.\n# Basic evaluation\naxolotl lm-eval config.yml\nConfiguration options:\n# List of tasks to evaluate\nlm_eval_tasks:\n - arc_challenge\n - hellaswag\nlm_eval_batch_size: # Batch size for evaluation\noutput_dir: # Directory to save evaluation results\nSee LM Eval Harness for more details.\n\n\ndelinearize-llama4\nDelinearizes a Llama 4 linearized model into a regular HuggingFace Llama 4 model. This only works with the non-quantized linearized model.\naxolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir\nThis would be necessary to use with other frameworks. If you have an adapter, merge it with the non-quantized linearized model before delinearizing.\n\n\nquantize\nQuantizes a model using the quantization configuration specified in your YAML file.\naxolotl quantize config.yml\nSee Quantization for more details.", - "crumbs": [ - "Getting Started", - "Command Line Interface (CLI)" - ] - }, - { - "objectID": "docs/cli.html#legacy-cli-usage", - "href": "docs/cli.html#legacy-cli-usage", - "title": "Command Line Interface (CLI)", - "section": "Legacy CLI Usage", - "text": "Legacy CLI Usage\nWhile the new Click-based CLI is preferred, Axolotl still supports the legacy module-based CLI:\n# Preprocess\npython -m axolotl.cli.preprocess config.yml\n\n# Train\naccelerate launch -m axolotl.cli.train config.yml\n\n# Inference\naccelerate launch -m axolotl.cli.inference config.yml \\\n --lora_model_dir=\"./outputs/lora-out\"\n\n# Gradio interface\naccelerate launch -m axolotl.cli.inference config.yml \\\n --lora_model_dir=\"./outputs/lora-out\" --gradio\n\n\n\n\n\n\nImportant\n\n\n\nWhen overriding CLI parameters in the legacy CLI, use same notation as in yaml file (e.g., --lora_model_dir).\nNote: This differs from the new Click-based CLI, which uses dash notation (e.g., --lora-model-dir). 
Keep this in mind if you’re referencing newer documentation or switching between CLI versions.", - "crumbs": [ - "Getting Started", - "Command Line Interface (CLI)" - ] - }, - { - "objectID": "docs/cli.html#remote-compute-with-modal-cloud", - "href": "docs/cli.html#remote-compute-with-modal-cloud", - "title": "Command Line Interface (CLI)", - "section": "Remote Compute with Modal Cloud", - "text": "Remote Compute with Modal Cloud\nAxolotl supports running training and inference workloads on Modal cloud infrastructure. This is configured using a\ncloud YAML file alongside your regular Axolotl config.\n\nCloud Configuration\nCreate a cloud config YAML with your Modal settings:\n# cloud_config.yml\nprovider: modal\ngpu: a100 # Supported: l40s, a100-40gb, a100-80gb, a10g, h100, t4, l4\ngpu_count: 1 # Number of GPUs to use\ntimeout: 86400 # Maximum runtime in seconds (24 hours)\nbranch: main # Git branch to use (optional)\n\nvolumes: # Persistent storage volumes\n - name: axolotl-cache\n mount: /workspace/cache\n - name: axolotl-data\n mount: /workspace/data\n - name: axolotl-artifacts\n mount: /workspace/artifacts\n\nsecrets: # Secrets to inject\n - WANDB_API_KEY\n - HF_TOKEN\n\n\nRunning on Modal Cloud\nCommands that support the –cloud flag:\n# Preprocess on cloud\naxolotl preprocess config.yml --cloud cloud_config.yml\n\n# Train on cloud\naxolotl train config.yml --cloud cloud_config.yml\n\n# Run lm-eval on cloud\naxolotl lm-eval config.yml --cloud cloud_config.yml\n\n\nCloud Configuration Options\nprovider: # compute provider, currently only `modal` is supported\ngpu: # GPU type to use\ngpu_count: # Number of GPUs (default: 1)\nmemory: # RAM in GB (default: 128)\ntimeout: # Maximum runtime in seconds\ntimeout_preprocess: # Preprocessing timeout\nbranch: # Git branch to use\ndocker_tag: # Custom Docker image tag\nvolumes: # List of persistent storage volumes\n\n# Environment variables to pass. Can be specified in two ways:\n# 1. As a string: Will load the value from the host computer's environment variables\n# 2. As a key-value pair: Will use the specified value directly\n# Example:\n# env:\n# - CUSTOM_VAR # Loads from host's $CUSTOM_VAR\n# - {CUSTOM_VAR: \"value\"} # Uses \"value\" directly\nenv:\n\n# Secrets to inject. Same input format as `env` but for sensitive data.\nsecrets:\n # - HF_TOKEN\n # - WANDB_API_KEY", - "crumbs": [ - "Getting Started", - "Command Line Interface (CLI)" - ] - }, - { - "objectID": "docs/nccl.html", - "href": "docs/nccl.html", - "title": "NCCL", - "section": "", - "text": "NVIDIA NCCL is a library to facilitate and optimize multi-GPU communication operations, such as broadcast, all-gather, reduce, all-reduce, etc. Broadly, NCCL configuration is highly environment-specific and is configured via several environment variables. A common NCCL-related problem occurs when a long-running operation times out causing the training process to abort:\nWatchdog caught collective operation timeout: WorkNCCL(SeqNum=42, OpType=ALLGATHER, Timeout(ms)=1800000) ran for 1806948 milliseconds before timing out.\nOften, this timeout will happen after 30 minutes (the default setting) and is accompanied by below-average power consumption with near 100% GPU utilization before the error is raised. Nvidia recommends disabling PCI access control services (ACS) as a possible solution if this is available to you.\nForcing cross-GPU communication via NVLink may help without increasing timeouts. 
To verify that your configuration is leveraging NVLink run the following command:\nnvidia-smi nvlink --status\nTo force NCCL to use NVLink, simply set this in the environment:\nexport NCCL_P2P_LEVEL=NVL\nIf NVLink is not available in your environment there are other options for NCCL_P2P_LEVEL in the table below:\n\n\n\n\n\n\n\nNCCL_P2P_LEVEL\nDescription\n\n\n\n\nPIX\nP2P data transfers through no more than a single PCIe bridge. Faster data transfer rates vs to paths involving multiple bridges, but slower compared to direct GPU-to-GPU communication.\n\n\nPXB\nP2P data transfers through multiple PCIe bridges but not going through the PCIe Host Bridge; this path involves a complex routing process, potentially incurring a moderate level of latency.\n\n\nPHB\nP2P data transfers occur over the PCIe and through a PCIe Host Bridge, typically involving the CPU, which can facilitate direct memory access but might introduce additional latency compared to more direct paths (ex PIX, NVL)\n\n\n\nTo validate that acceptable data transfer speeds exist for your training job, running NCCL Tests can help pinpoint bottlenecks, for example:\n./build/all_reduce_perf -b 8 -e 128M -f 2 -g 3\nIt can be useful when debugging NCCL communication timeouts to activate additional logging in both PyTorch and NCCL:\nexport NCCL_DEBUG=INFO\nexport NCCL_DEBUG_SUBSYS=ALL\nexport TORCH_DISTRIBUTED_DEBUG=INFO\nexport TORCHELASTIC_ERROR_FILE=/PATH/TO/torcherror.log\nFinally, if you believe your training job needs more time you can increase the timeout past 30 minutes by setting the ddp_timeout value in the Axolotl configuration. See PyTorch init_process_group for documentation on this value.", - "crumbs": [ - "Troubleshooting", - "NCCL" - ] - }, - { - "objectID": "docs/dataset_preprocessing.html", - "href": "docs/dataset_preprocessing.html", - "title": "Dataset Preprocessing", - "section": "", - "text": "Dataset pre-processing is the step where Axolotl takes each dataset you’ve configured alongside\nthe dataset format and prompt strategies to:\n\nparse the dataset based on the dataset format\ntransform the dataset to how you would interact with the model based on the prompt strategy\ntokenize the dataset based on the configured model & tokenizer\nshuffle and merge multiple datasets together if using more than one\n\nThe processing of the datasets can happen one of two ways:\n\nBefore kicking off training by calling axolotl preprocess config.yaml --debug\nWhen training is started\n\n\n\nWhen training interactively or for sweeps\n(e.g. you are restarting the trainer often), processing the datasets can oftentimes be frustratingly\nslow. Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent\ntraining parameters so that it will intelligently pull from its cache when possible.\nThe path of the cache is controlled by dataset_prepared_path: and is often left blank in example\nYAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data.\nIf dataset_prepared_path: is left empty, when training, the processed dataset will be cached in a\ndefault path of ./last_run_prepared/, but will ignore anything already cached there. By explicitly\nsetting dataset_prepared_path: ./last_run_prepared, the trainer will use whatever pre-processed\ndata is in the cache.\n\n\n\nLet’s say you are writing a custom prompt strategy or using a user-defined\nprompt template. 
Because the trainer cannot readily detect these changes, we cannot change the\ncalculated hash value for the pre-processed dataset.\nIf you have dataset_prepared_path: ... set\nand change your prompt templating logic, it may not pick up the changes you made and you will be\ntraining over the old prompt.", - "crumbs": [ - "Core Concepts", - "Dataset Preprocessing" - ] - }, - { - "objectID": "docs/dataset_preprocessing.html#overview", - "href": "docs/dataset_preprocessing.html#overview", - "title": "Dataset Preprocessing", - "section": "", - "text": "Dataset pre-processing is the step where Axolotl takes each dataset you’ve configured alongside\nthe dataset format and prompt strategies to:\n\nparse the dataset based on the dataset format\ntransform the dataset to how you would interact with the model based on the prompt strategy\ntokenize the dataset based on the configured model & tokenizer\nshuffle and merge multiple datasets together if using more than one\n\nThe processing of the datasets can happen one of two ways:\n\nBefore kicking off training by calling axolotl preprocess config.yaml --debug\nWhen training is started\n\n\n\nWhen training interactively or for sweeps\n(e.g. you are restarting the trainer often), processing the datasets can oftentimes be frustratingly\nslow. Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent\ntraining parameters so that it will intelligently pull from its cache when possible.\nThe path of the cache is controlled by dataset_prepared_path: and is often left blank in example\nYAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data.\nIf dataset_prepared_path: is left empty, when training, the processed dataset will be cached in a\ndefault path of ./last_run_prepared/, but will ignore anything already cached there. By explicitly\nsetting dataset_prepared_path: ./last_run_prepared, the trainer will use whatever pre-processed\ndata is in the cache.\n\n\n\nLet’s say you are writing a custom prompt strategy or using a user-defined\nprompt template. Because the trainer cannot readily detect these changes, we cannot change the\ncalculated hash value for the pre-processed dataset.\nIf you have dataset_prepared_path: ... set\nand change your prompt templating logic, it may not pick up the changes you made and you will be\ntraining over the old prompt.", - "crumbs": [ - "Core Concepts", - "Dataset Preprocessing" - ] - }, - { - "objectID": "docs/faq.html", - "href": "docs/faq.html", - "title": "FAQ", - "section": "", - "text": "General\nQ: The trainer stopped and hasn’t progressed in several minutes.\n\nA: Usually an issue with the GPUs communicating with each other. See the NCCL doc\n\nQ: exitcode: -9\n\nA: This usually happens when you run out of system RAM.\n\nQ: exitcode: -7 while using deepspeed\n\nA: Try upgrading deepspeed w: pip install -U deepspeed\n\nQ: AttributeError: ‘DummyOptim’ object has no attribute ‘step’\nQ: ModuleNotFoundError: No module named ‘mpi4py’ using single GPU with deepspeed\n\nA: You may be using deepspeed with single gpu. Please remove the deepspeed: section in the yaml file or --deepspeed CLI flag.\n\nQ: The codes is stuck on saving preprocessed datasets.\n\nA: This is usually an issue with the GPU. This can be resolved through setting the os environment variable CUDA_VISIBLE_DEVICES=0. If you are on runpod, this is usually a pod issue. 
Starting a new pod should take care of it.\n\nQ: Received mismatch error on merge adapters / loading adapters between torch.Size of checkpoint and model.\n\nA: This is likely due to vocab size mismatch. By default, Axolotl expands the model’s embeddings if the tokenizer has more tokens than the model. Please use the axolotl merge-lora command to merge the adapters instead of using your own scripts.\n\n\nOn the other hand, if the model has more tokens than the tokenizer, Axolotl does not shrink the model’s embeddings unless shrink_embeddings: true is set in the config.\n\nQ: How to call Axolotl via custom python scripts?\n\nA: Since Axolotl is just Python, please see src/axolotl/cli/main.py on how each command is called.\n\nQ: How to know the value to use for fsdp_transformer_layer_cls_to_wrap?\n\nA: This is the class name of the transformer layer to wrap with FSDP. For example, for LlamaForCausalLM, the value is LlamaDecoderLayer. To find this for a specific model, check the model’s PreTrainedModel definition and look for _no_split_modules variable in the modeling_<model_name>.py file within transformers library.\n\nQ: ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as pad_token\n\nA: This is because the tokenizer does not have a padding token. Please add a padding token to the tokenizer via:\n\n\nspecial_tokens:\n # str. If you're not sure, set to same as `eos_token`.\n pad_token: \"...\"\n\nQ: IterableDataset error or KeyError: 'input_ids' when using preprocess CLI\n\nA: This is because you may be using preprocess CLI with pretraining_dataset: or skip_prepare_dataset: true respectively. Please use axolotl train CLI directly instead as these datasets are prepared on demand.\n\nQ: vLLM is not working with Axolotl\n\nA: We currently recommend torch 2.6.0 for use with vllm. Please ensure you use the right version. For Docker, please use the main-py3.11-cu124-2.6.0 tag.\n\nQ: FA2 2.8.0 undefined symbol runtime error on CUDA 12.4\n\nA: There seems to be a wheel issue with FA2 2.8.0 on CUDA 12.4. Try CUDA 12.6 instead or downgrade to FA2 2.7.4. Please refer to the upstream issue: https://github.com/Dao-AILab/flash-attention/issues/1717.\n\n\n\nChat templates\nQ: jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____\n\nA: This means that the property mapping for the stated attribute does not exist when building chat_template prompt. For example, if no attribute 'content', please check you have added the correct mapping for content under message_property_mappings.\n\nQ: Empty template generated for turn ___\n\nA: The content is empty for that turn.\n\nQ: Could not find content start/end boundary for turn __\n\nA: The specific turn’s start/end could not be detected. Please ensure you have set the eos_token following your chat_template. Otherwise, this could be a chat_template which doesn’t use proper boundaries for each turn (like system). On the rare occurrence, make sure your content is not [[dummy_message]]. Please let us know about this.\n\nQ: Content end boundary is before start boundary for turn ___\n\nA: This is an edge case which should not occur. Please create an Issue if this happens.\n\nQ: Content end boundary is the same as start boundary for turn ___. 
This is likely an empty turn.\n\nA: This is likely an empty turn.\n\nQ: The EOS token is incorrectly being masked or not being masked / EOS token __ not found in chat template.\n\nA: There can be two reasons:\n\n\n\nThis is because of the mismatch between tokenizer.eos_token and EOS token in template. Please make sure to set eos_token: under special_tokens: to the same EOS token as in template.\n\n\n\n\nThe EOS token is not in the template. Please check if your template is correct. As an example, phi_35 template does not use its dedicated EOS token <|endoftext|> at the end.\n\n\nQ: “chat_template choice is tokenizer_default but tokenizer’s chat_template is null. Please add a chat_template in tokenizer config”\n\nA: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See chat_template for more details.\n\nQ: The EOT token(s) are incorrectly being masked or not being masked / EOT token __ not found in chat template.\n\nA: There can be two reasons:\n\n\n\nThe EOT token is different from the EOS token and was not specified under eot_tokens:. Please set eot_tokens: to the same EOT token(s) as in template.\n\n\n\n\nThere is more than one EOT token per turn in the template. Please raise an issue with examples as we recognize this as an edge case.\n\n\nQ: EOT token encoding failed. Please check if the token is valid and can be encoded.\n\nA: There could be some issue with the tokenizer or unicode encoding. Please raise an issue with examples with the EOT token & tokenizer causing the issue.\n\nQ: EOT token __ is encoded as multiple tokens.\n\nA: This is because the EOT token is encoded as multiple tokens which can cause unexpected behavior. Please add it under tokens: or (recommended) override unused added_tokens via added_tokens_overrides:.\n\nQ: Conflict between train_on_eos and train_on_eot. eos_token is in eot_tokens and train_on_eos != train_on_eot\n\nA: This is because the EOS token is in the eot_tokens: while mismatch between train_on_eos: and train_on_eot:. This will cause one to override the other. Please ensure that train_on_eos: and train_on_eot: are the same or remove the EOS token from eot_tokens:.\n\nQ: If eot_tokens: is not provided, what happens?\n\nA: If eot_tokens: is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable.\n\n\nInternally, eot_tokens: tokenizer.eos_token and train_on_eot: train_on_eos (which defaults to turn). This transition helps clarify the naming and behavior of EOT/EOS tokens.\n\nQ: Data processing error: CAS service error\n\nA: Try disabling XET with export HF_HUB_DISABLE_XET=1\n\nQ: torch._inductor.exc.LoweringException: NoValidChoicesError: No choices to select, please consider adding ATEN into max_autotune_gemm_backends config (defined in torch/_inductor/config.py) to allow at least one choice.\n\nA: Depending on the version of torch, you may need to include this in your YAML:\n\n\nflex_attn_compile_kwargs:\n dynamic: false\n mode: max-autotune-no-cudagraphs\n\n**Q: ValueError(\"Backward pass should have cleared tracker of all tensors\")\n\nA: This may happen due to edge cases in using the modern OffloadActivations context manager for CUDA streams. 
If you encounter this error, you may have success using the naive implementation with offload_activations: legacy in your YAML.", - "crumbs": [ - "Troubleshooting", - "FAQ" - ] - }, - { - "objectID": "docs/qat.html", - "href": "docs/qat.html", - "title": "Quantization Aware Training (QAT)", - "section": "", - "text": "Quantization Aware Training (QAT) is a technique for improving the accuracy of models which are quantized\nby applying “fake” quantizations to the model’s weights (and optionally, activations) during training. This fake\nquantization allows for the model to adjust for noise introduced by the quantization, so when the model is eventually\nquantized, the accuracy loss is minimized. We use the quantization techniques implemented in torchao to provide\nsupport for QAT and post-training quantization (PTQ) in axolotl.\nWe recommend reviewing the excellent QAT tutorial in the torchtune library,\nand the QAT documentation in the torchao library, for more details.", - "crumbs": [ - "How To Guides", - "Quantization Aware Training (QAT)" - ] - }, - { - "objectID": "docs/qat.html#overview", - "href": "docs/qat.html#overview", - "title": "Quantization Aware Training (QAT)", - "section": "", - "text": "Quantization Aware Training (QAT) is a technique for improving the accuracy of models which are quantized\nby applying “fake” quantizations to the model’s weights (and optionally, activations) during training. This fake\nquantization allows for the model to adjust for noise introduced by the quantization, so when the model is eventually\nquantized, the accuracy loss is minimized. We use the quantization techniques implemented in torchao to provide\nsupport for QAT and post-training quantization (PTQ) in axolotl.\nWe recommend reviewing the excellent QAT tutorial in the torchtune library,\nand the QAT documentation in the torchao library, for more details.", - "crumbs": [ - "How To Guides", - "Quantization Aware Training (QAT)" - ] - }, - { - "objectID": "docs/qat.html#configuring-qat-in-axolotl", - "href": "docs/qat.html#configuring-qat-in-axolotl", - "title": "Quantization Aware Training (QAT)", - "section": "Configuring QAT in Axolotl", - "text": "Configuring QAT in Axolotl\nTo enable QAT in axolotl, add the following to your configuration file:\nqat:\n activation_dtype: # Optional[str] = \"int8\". Fake quantization layout to use for activation quantization. Valid options are \"int4\" and \"int8\"\n weight_dtype: # Optional[str] = \"int8\". Fake quantization layout to use for weight quantization. Valid options are \"int4\" and \"int8\"\n group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization\n fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after\nOnce you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. 
You can use the quantize command to do this.", - "crumbs": [ - "How To Guides", - "Quantization Aware Training (QAT)" - ] - }, - { - "objectID": "docs/gradient_checkpointing.html", - "href": "docs/gradient_checkpointing.html", - "title": "Gradient Checkpointing and Activation Offloading", - "section": "", - "text": "Gradient checkpointing and activation offloading are techniques used to optimize the performance of deep learning\nmodels by reducing the memory footprint and improving computational efficiency.\n\nEnabling Gradient Checkpointing\ngradient_checkpointing: true\n\n\nEnabling Activation Offloading\ngradient_checkpointing: true # required for activation offloading\nactivation_offloading: true\nActivation offloading variants:\nThe default activation_offloading: true offloads activations to CPU and uses CUDA streams\nto overlap the communications and computations when offloading.\nThe activation_offloading: legacy naively offloads activations to CPU and without additional optimizations.\nFor resource constrained environments with limited CPU memory, activation_offloading: disk offloads\nactivations to disk instead of CPU RAM so that much larger context lengths can be trained with minimal memory.", + "text": "Parallelism strategies can be combined. The key is understanding how each one divides the workload. PyTorch’s DeviceMesh is the modern way to manage these combinations, creating a logical grid of your GPUs and assigning different parallel strategies to different dimensions of the grid.\n\n\nData Parallelism focuses on splitting the global data batch across GPUs.\n\nDistributed Data Parallel (DDP): The classic approach. The full model is replicated on every GPU. Each GPU processes a different slice of the data batch. Gradients are then averaged across all GPUs after the backward pass to keep the models synchronized. This can substantially improve data throughput compared to single-device training, but requires that each GPU is able to hold the entire model, its gradients, and optimizer states.\nFully Sharded Data Parallel (FSDP): A highly memory-efficient form of data parallelism (inspired by DeepSpeed’s ZeRO). Instead of replicating the model, FSDP shards the model’s parameters, gradients, and optimizer states across the GPUs in the data-parallel group. During computation, each GPU receives the specific parameters it needs via an all_gather operation just before they are used, and they can be discarded immediately after (reshard-after-forward).\n\nFSDP maps to ZeRO stages:\n\nZeRO-2 (reshard_after_forward=False): Shards gradients and optimizer states. Model weights are replicated on each GPU.\nZeRO-3 (reshard_after_forward=True): Shards gradients, optimizer states, AND model parameters. This provides the most memory savings at the cost of more communication (re-gathering parameters for both forward and backward passes).\n\n\n\n\n\n\nAlso known as “horizontal model parallelism,” as described in the Megatron-LM paper. Instead of splitting the batch, TP splits the model’s layers themselves across GPUs.\n\nHow it works: For a linear layer Y = XA, the weight matrix A is split column-wise (A = [A_1, A_2]). The computation becomes Y_1 = XA_1 and Y_2 = XA_2, which can happen in parallel on different GPUs. The final output Y is simply the concatenation of Y_1 and Y_2. Check this comment for more detailed info.\nRequirement: TP involves frequent, small communications within a forward/backward pass. 
It requires a very fast interconnect between GPUs (e.g., NVLink) and is typically not recommended across different nodes.\n\n\n\n\nContext Parallelism, also called Sequence Parallelism, addresses the memory bottleneck from long sequences. The input sequence itself is split along the sequence length dimension and distributed across GPUs.\n\nHow it works: If you have a sequence of 8192 tokens and a context_parallel_size of 4, each GPU will only handle a chunk of 2048 tokens.\nThe Challenge: Attention is not local; every token needs to “attend to” every other token. Splitting the sequence breaks this.\nThe Solution (ring-flash-attention): An efficient communication protocol is used. To compute attention for its local sequence chunk, each GPU passes its Key-Value (KV) cache to its neighbor in a “ring.” After N-1 steps, every GPU has seen the KV-cache from all other GPUs, allowing it to compute the correct attention values for its chunk. This is implemented using the highly optimized flash-attention kernel at each step.\n\n\n\n\nHSDP is a 2D strategy that intelligently combines FSDP and DDP, typically for multi-node training.\n\nIntra-Node (within a machine): Use FSDP. This is efficient because GPUs on the same node have fast interconnects (NVLink), making the all_gather operations for sharded parameters fast.\nInter-Node (across machines): Use DDP. The gradient synchronization between nodes is less frequent than FSDP’s parameter gathering, making it a better fit for the slower node-to-node network (e.g., Ethernet/Infiniband).\nExample: With 2 nodes of 8 GPUs each (16 total), you could have dp_shard_size=8 (FSDP within each node) and dp_replicate_size=2 (DDP across the two nodes).", "crumbs": [ "Advanced Features", - "Gradient Checkpointing and Activation Offloading" + "N-D Parallelism" ] }, { - "objectID": "docs/input_output.html", - "href": "docs/input_output.html", - "title": "Template-free prompt construction", + "objectID": "docs/nd_parallelism.html#usage", + "href": "docs/nd_parallelism.html#usage", + "title": "N-D Parallelism", "section": "", - "text": "The documentation moved to here." - }, - { - "objectID": "src/axolotl/integrations/LICENSE.html", - "href": "src/axolotl/integrations/LICENSE.html", - "title": "Axolotl", - "section": "", - "text": "AXOLOTL COMMUNITY LICENSE AGREEMENT\nThis Axolotl Community License Agreement (“Agreement”) is entered into by and between Axolotl AI Corp. (“Axolotl”) and\nany individual or entity (“Licensee”) who wishes to use the Software (as defined below) in accordance with the terms\nand conditions set forth in this Agreement.\n\nDefinitions\n1.1 “Licensee” refers to any individual or entity who has obtained a copy of the Software under this Agreement.\n1.2 “Plugin Integration” means independent integration software modules which may or may not be offered by Axolotl,\nwhich may be licensed separately by their respective authors and/or licensors.\n1.3 “Software” refers to the specific sub-directory of the Axolotl, Inc. 
software located at\nhttps://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations and its subdirectories which\npermits Plugin Integrations to integrate with the Axolotl service.\nGrant of License\n2.1 Axolotl hereby grants Licensee a worldwide, non-exclusive, royalty-free, license to use, copy, modify, merge,\npublish, distribute, sublicense, and/or otherwise exploit the Software, subject to the following conditions:\n- Licensee must comply with all the terms and conditions of this Agreement.\n- Licensee must include the original copyright notice and disclaimer of warranty in all copies or substantial\nportions of the Software.\n2.2 Licensee may use the Software for any lawful purpose, except as restricted in Section 3.\nRestrictions\n3.1 Licensee shall not use the Software for any activity that constitutes a commercial activity of offering for\nfree or for sale any services, platform, or equivalent to third parties for the purposes of allowing such\nthird parties to fine-tune artificial intelligence models.\n3.2 Licensee shall not:\n- Use the Software for any illegal or unauthorized purpose.\n- Reverse engineer, decompile, or disassemble the Software.\n- Remove or modify any copyright, trademark, or other proprietary notices contained in the Software.\n- Use the Software in a way that could damage, disable, overburden, or impair the functionality of the\nSoftware or interfere with any third-party use of the Software.\n3.3 Axolotl reserves the right to restrict certain Plugin Integrations for use with the Software. To the extent Licensee integrates a permitted, applicable Plugin Integration with the Software, Licensee shall comply with any additional terms and conditions imposed by the licensors of such Plugin Integration for use of such Plugin Integrations. Licensee shall contact Axolotl if it has questions about whether its use of the Software falls beyond the scope of this Agreement.\nIntellectual Property Rights\n4.1 Axolotl and its contributors retain all intellectual property rights in and to the Software. Licensee\nacknowledges that this Agreement does not transfer any ownership rights or intellectual property rights to\nLicensee.\nDisclaimer of Warranty\n5.1 THE SOFTWARE IS PROVIDED “AS IS,” WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED\nTO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. IN NO EVENT SHALL\nTHE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF\nCONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\nDEALINGS IN THE SOFTWARE.\nTermination\n6.1 Axolotl may terminate this Agreement at any time if Licensee fails to comply with any of the terms and\nconditions set forth herein. Upon termination, Licensee shall cease all use of the Software and destroy any\ncopies in its possession.\nGoverning Law\n7.1 This Agreement shall be governed by and construed in accordance with the laws of the State of California,\nwithout regards to conflicts of laws provisions thereof.\nEntire Agreement\n8.1 This Agreement constitutes the entire agreement between Axolotl and Licensee with respect to the subject matter\nhereof and supersedes all prior or contemporaneous understandings or agreements between the parties concerning\nthe Software, whether written or oral. 
Axolotl may update the terms of this Agreement from time to time, and\nLicensee’s continued use of the Software after any such updates shall constitute acceptance of updated terms\non a go-forward basis. Axolotl will use commercially reasonable efforts to provide Licensee notice of any\nmaterial updates. By using the Software, Licensee acknowledges that it has read, understood, and agrees to be\nbound by the terms and conditions of this Agreement.\n\nThis Agreement was last updated on August 23, 2024." - }, - { - "objectID": "src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html", - "href": "src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html", - "title": "Axolotl", - "section": "", - "text": "Acknowledgements\nPortions of this Cut Cross Entropy Software may utilize the following copyrighted\nmaterial, the use of which is hereby acknowledged.\n\nPyTorch\nFrom PyTorch:\n\nCopyright (c) 2016- Facebook, Inc (Adam Paszke)\nCopyright (c) 2014- Facebook, Inc (Soumith Chintala)\nCopyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)\nCopyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)\nCopyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)\nCopyright (c) 2011-2013 NYU (Clement Farabet)\nCopyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)\nCopyright (c) 2006 Idiap Research Institute (Samy Bengio)\nCopyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)\n\nFrom Caffe2:\n\nCopyright (c) 2016-present, Facebook Inc. All rights reserved.\n\nAll contributions by Facebook:\nCopyright (c) 2016 Facebook Inc.\n\nAll contributions by Google:\nCopyright (c) 2015 Google Inc.\nAll rights reserved.\n\nAll contributions by Yangqing Jia:\nCopyright (c) 2015 Yangqing Jia\nAll rights reserved.\n\nAll contributions by Kakao Brain:\nCopyright 2019-2020 Kakao Brain\n\nAll contributions by Cruise LLC:\nCopyright (c) 2022 Cruise LLC.\nAll rights reserved.\n\nAll contributions by Arm:\nCopyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates\n\nAll contributions from Caffe:\nCopyright(c) 2013, 2014, 2015, the respective contributors\nAll rights reserved.\n\nAll other contributions:\nCopyright(c) 2015, 2016 the respective contributors\nAll rights reserved.\n\nCaffe2 uses a copyright model similar to Caffe: each contributor holds\ncopyright over their contributions to Caffe2. The project versioning records\nall such contribution and copyright details. If a contributor wants to further\nmark their specific copyright on a particular contribution, they should\nindicate their copyright solely in the commit message of the change when it is\ncommitted.\n\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions are met:\n\n1. Redistributions of source code must retain the above copyright\nnotice, this list of conditions and the following disclaimer.\n\n2. Redistributions in binary form must reproduce the above copyright\nnotice, this list of conditions and the following disclaimer in the\ndocumentation and/or other materials provided with the distribution.\n\n3. 
Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America\nand IDIAP Research Institute nor the names of its contributors may be\nused to endorse or promote products derived from this software without\nspecific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\nARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE\nLIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\nCONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\nSUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\nINTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\nCONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\nARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\nPOSSIBILITY OF SUCH DAMAGE.\nTriton\n/*\n* Copyright 2018-2020 Philippe Tillet\n* Copyright 2020-2022 OpenAI\n*\n* Permission is hereby granted, free of charge, to any person obtaining\n* a copy of this software and associated documentation files\n* (the \"Software\"), to deal in the Software without restriction,\n* including without limitation the rights to use, copy, modify, merge,\n* publish, distribute, sublicense, and/or sell copies of the Software,\n* and to permit persons to whom the Software is furnished to do so,\n* subject to the following conditions:\n*\n* The above copyright notice and this permission notice shall be\n* included in all copies or substantial portions of the Software.\n*\n* THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,\n* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n*/\nTransformers\nCopyright 2018- The Hugging Face team. All rights reserved.\n\n Apache License\n Version 2.0, January 2004\n http://www.apache.org/licenses/\n\nTERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n1. Definitions.\n\n \"License\" shall mean the terms and conditions for use, reproduction,\n and distribution as defined by Sections 1 through 9 of this document.\n\n \"Licensor\" shall mean the copyright owner or entity authorized by\n the copyright owner that is granting the License.\n\n \"Legal Entity\" shall mean the union of the acting entity and all\n other entities that control, are controlled by, or are under common\n control with that entity. 
For the purposes of this definition,\n \"control\" means (i) the power, direct or indirect, to cause the\n direction or management of such entity, whether by contract or\n otherwise, or (ii) ownership of fifty percent (50%) or more of the\n outstanding shares, or (iii) beneficial ownership of such entity.\n\n \"You\" (or \"Your\") shall mean an individual or Legal Entity\n exercising permissions granted by this License.\n\n \"Source\" form shall mean the preferred form for making modifications,\n including but not limited to software source code, documentation\n source, and configuration files.\n\n \"Object\" form shall mean any form resulting from mechanical\n transformation or translation of a Source form, including but\n not limited to compiled object code, generated documentation,\n and conversions to other media types.\n\n \"Work\" shall mean the work of authorship, whether in Source or\n Object form, made available under the License, as indicated by a\n copyright notice that is included in or attached to the work\n (an example is provided in the Appendix below).\n\n \"Derivative Works\" shall mean any work, whether in Source or Object\n form, that is based on (or derived from) the Work and for which the\n editorial revisions, annotations, elaborations, or other modifications\n represent, as a whole, an original work of authorship. For the purposes\n of this License, Derivative Works shall not include works that remain\n separable from, or merely link (or bind by name) to the interfaces of,\n the Work and Derivative Works thereof.\n\n \"Contribution\" shall mean any work of authorship, including\n the original version of the Work and any modifications or additions\n to that Work or Derivative Works thereof, that is intentionally\n submitted to Licensor for inclusion in the Work by the copyright owner\n or by an individual or Legal Entity authorized to submit on behalf of\n the copyright owner. For the purposes of this definition, \"submitted\"\n means any form of electronic, verbal, or written communication sent\n to the Licensor or its representatives, including but not limited to\n communication on electronic mailing lists, source code control systems,\n and issue tracking systems that are managed by, or on behalf of, the\n Licensor for the purpose of discussing and improving the Work, but\n excluding communication that is conspicuously marked or otherwise\n designated in writing by the copyright owner as \"Not a Contribution.\"\n\n \"Contributor\" shall mean Licensor and any individual or Legal Entity\n on behalf of whom a Contribution has been received by Licensor and\n subsequently incorporated within the Work.\n\n2. Grant of Copyright License. Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n copyright license to reproduce, prepare Derivative Works of,\n publicly display, publicly perform, sublicense, and distribute the\n Work and such Derivative Works in Source or Object form.\n\n3. Grant of Patent License. 
Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n (except as stated in this section) patent license to make, have made,\n use, offer to sell, sell, import, and otherwise transfer the Work,\n where such license applies only to those patent claims licensable\n by such Contributor that are necessarily infringed by their\n Contribution(s) alone or by combination of their Contribution(s)\n with the Work to which such Contribution(s) was submitted. If You\n institute patent litigation against any entity (including a\n cross-claim or counterclaim in a lawsuit) alleging that the Work\n or a Contribution incorporated within the Work constitutes direct\n or contributory patent infringement, then any patent licenses\n granted to You under this License for that Work shall terminate\n as of the date such litigation is filed.\n\n4. Redistribution. You may reproduce and distribute copies of the\n Work or Derivative Works thereof in any medium, with or without\n modifications, and in Source or Object form, provided that You\n meet the following conditions:\n\n (a) You must give any other recipients of the Work or\n Derivative Works a copy of this License; and\n\n (b) You must cause any modified files to carry prominent notices\n stating that You changed the files; and\n\n (c) You must retain, in the Source form of any Derivative Works\n that You distribute, all copyright, patent, trademark, and\n attribution notices from the Source form of the Work,\n excluding those notices that do not pertain to any part of\n the Derivative Works; and\n\n (d) If the Work includes a \"NOTICE\" text file as part of its\n distribution, then any Derivative Works that You distribute must\n include a readable copy of the attribution notices contained\n within such NOTICE file, excluding those notices that do not\n pertain to any part of the Derivative Works, in at least one\n of the following places: within a NOTICE text file distributed\n as part of the Derivative Works; within the Source form or\n documentation, if provided along with the Derivative Works; or,\n within a display generated by the Derivative Works, if and\n wherever such third-party notices normally appear. The contents\n of the NOTICE file are for informational purposes only and\n do not modify the License. You may add Your own attribution\n notices within Derivative Works that You distribute, alongside\n or as an addendum to the NOTICE text from the Work, provided\n that such additional attribution notices cannot be construed\n as modifying the License.\n\n You may add Your own copyright statement to Your modifications and\n may provide additional or different license terms and conditions\n for use, reproduction, or distribution of Your modifications, or\n for any such Derivative Works as a whole, provided Your use,\n reproduction, and distribution of the Work otherwise complies with\n the conditions stated in this License.\n\n5. Submission of Contributions. Unless You explicitly state otherwise,\n any Contribution intentionally submitted for inclusion in the Work\n by You to the Licensor shall be under the terms and conditions of\n this License, without any additional terms or conditions.\n Notwithstanding the above, nothing herein shall supersede or modify\n the terms of any separate license agreement you may have executed\n with Licensor regarding such Contributions.\n\n6. Trademarks. 
This License does not grant permission to use the trade\n names, trademarks, service marks, or product names of the Licensor,\n except as required for reasonable and customary use in describing the\n origin of the Work and reproducing the content of the NOTICE file.\n\n7. Disclaimer of Warranty. Unless required by applicable law or\n agreed to in writing, Licensor provides the Work (and each\n Contributor provides its Contributions) on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n implied, including, without limitation, any warranties or conditions\n of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n PARTICULAR PURPOSE. You are solely responsible for determining the\n appropriateness of using or redistributing the Work and assume any\n risks associated with Your exercise of permissions under this License.\n\n8. Limitation of Liability. In no event and under no legal theory,\n whether in tort (including negligence), contract, or otherwise,\n unless required by applicable law (such as deliberate and grossly\n negligent acts) or agreed to in writing, shall any Contributor be\n liable to You for damages, including any direct, indirect, special,\n incidental, or consequential damages of any character arising as a\n result of this License or out of the use or inability to use the\n Work (including but not limited to damages for loss of goodwill,\n work stoppage, computer failure or malfunction, or any and all\n other commercial damages or losses), even if such Contributor\n has been advised of the possibility of such damages.\n\n9. Accepting Warranty or Additional Liability. While redistributing\n the Work or Derivative Works thereof, You may choose to offer,\n and charge a fee for, acceptance of support, warranty, indemnity,\n or other liability obligations and/or rights consistent with this\n License. However, in accepting such obligations, You may act only\n on Your own behalf and on Your sole responsibility, not on behalf\n of any other Contributor, and only if You agree to indemnify,\n defend, and hold each Contributor harmless for any liability\n incurred by, or claims asserted against, such Contributor by reason\n of your accepting any such warranty or additional liability.\n\nEND OF TERMS AND CONDITIONS\n\nAPPENDIX: How to apply the Apache License to your work.\n\n To apply the Apache License to your work, attach the following\n boilerplate notice, with the fields enclosed by brackets \"[]\"\n replaced with your own identifying information. (Don't include\n the brackets!) The text should be enclosed in the appropriate\n comment syntax for the file format. We also recommend that a\n file or class name and description of purpose be included on the\n same \"printed page\" as the copyright notice for easier\n identification within third-party archives.\n\nCopyright [yyyy] [name of copyright owner]\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\n http://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License." 
- }, { "objectID": "docs/mac.html", "href": "docs/mac.html", "title": "Mac M-series", "section": "", "text": "Currently, Axolotl on Mac is only partially usable; many of Axolotl’s dependencies, including Pytorch, do not support MPS or have incomplete support.\nCurrent support:\n\nSupport for all models\nFull training of models\nLoRA training\nSample packing\nFP16 and BF16 (awaiting AMP support for MPS in Pytorch)\nTri-dao’s flash-attn (until it is supported, use sdp_attention as an alternative)\nxformers\nbitsandbytes (meaning no 4/8 bits loading and bnb optimizers)\nqlora\nDeepSpeed\n\nUntested:\n\nFSDP", "crumbs": [ "Deployments", "Mac M-series" ] }, { "objectID": "docs/lr_groups.html", "href": "docs/lr_groups.html", "title": "Learning Rate Groups", "section": "", "text": "Inspired by LoRA+, Axolotl allows practitioners to specify separate learning rates for each module or groups of\nmodules in a model.", "crumbs": [ "How To Guides", "Learning Rate Groups" ] }, { "objectID": "docs/lr_groups.html#background", "href": "docs/lr_groups.html#background", "title": "Learning Rate Groups", "section": "", "text": "Inspired by LoRA+, Axolotl allows practitioners to specify separate learning rates for each module or groups of\nmodules in a model.", "crumbs": [ "How To Guides", "Learning Rate Groups" ] }, { "objectID": "docs/lr_groups.html#example", "href": "docs/lr_groups.html#example", "title": "Learning Rate Groups", "section": "Example", "text": "Example\nlr_groups:\n - name: o_proj\n modules:\n - self_attn.o_proj.weight\n lr: 1e-6\n - name: q_proj\n modules:\n - model.layers.2.self_attn.q_proj.weight\n lr: 1e-5\n\nlearning_rate: 2e-5\nIn this example, we have a default learning rate of 2e-5 across the entire model, but we have a separate learning rate\nof 1e-6 for all the self attention o_proj modules across all layers, and a learning rate of 1e-5 for the 3rd layer’s\nself attention q_proj module.", "crumbs": [ "How To Guides", "Learning Rate Groups" ] }, { "objectID": "docs/dataset_loading.html", "href": "docs/dataset_loading.html", "title": "Dataset Loading", "section": "", "text": "Datasets can be loaded in a number of different ways depending on how they are saved (the extension of the file) and where they are stored.", "crumbs": [ "How To Guides", "Dataset Loading" ] }, { "objectID": "docs/dataset_loading.html#overview", "href": "docs/dataset_loading.html#overview", "title": "Dataset Loading", "section": "", "text": "Datasets can be loaded in a number of different ways depending on how they are saved (the extension of the file) and where they are stored.", "crumbs": [ "How To Guides", "Dataset Loading" ] }, { "objectID": "docs/dataset_loading.html#loading-datasets", "href": "docs/dataset_loading.html#loading-datasets", "title": "Dataset Loading", "section": "Loading Datasets", "text": "Loading Datasets\nWe use the datasets library to load datasets, using a mix of load_dataset and load_from_disk.\nYou may recognize the similarly named configs between load_dataset and the datasets section of the config file.\ndatasets:\n - path:\n name:\n data_files:\n split:\n revision:\n trust_remote_code:\n\n\n\n\n\n\nTip\n\n\n\nDo not feel overwhelmed by the number of options here. A lot of them are optional. 
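To illustrate, the equivalent datasets call is usually just as small (a minimal sketch; the dataset name here is hypothetical):\nfrom datasets import load_dataset\n\n# Most loads need nothing more than a path\ndataset = load_dataset(\"org/my-dataset\")\n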
In fact, the most common config to use would be path and sometimes data_files.\n\n\nThis matches the API of datasets.load_dataset, so if you’re familiar with that, you will feel right at home.\nFor HuggingFace’s guide to loading different dataset types, see here.\nFor full details on the config, see config-reference.qmd.\n\n\n\n\n\n\nNote\n\n\n\nYou can set multiple datasets in the config file by adding more than one entry under datasets.\ndatasets:\n - path: /path/to/your/dataset\n - path: /path/to/your/other/dataset\n\n\n\nLocal dataset\n\nFiles\nTo load a JSON file, you would do something like this:\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"json\", data_files=\"data.json\")\nWhich translates to the following config:\ndatasets:\n - path: data.json\n ds_type: json\nAs the example above shows, you can simply point path to the file or directory and set ds_type to load the dataset.\nThis works for CSV, JSON, Parquet, and Arrow files.\n\n\n\n\n\n\nTip\n\n\n\nIf path points to a file and ds_type is not specified, we will automatically infer the dataset type from the file extension, so you could omit ds_type if you’d like.\n\n\n\n\nDirectory\nIf you’re loading a directory, you can point the path to the directory.\nThen, you have two options:\n\nLoading entire directory\nYou do not need any additional configs.\nWe will attempt to load in the following order:\n- datasets saved with datasets.save_to_disk\n- loading the entire directory of files (such as with parquet/arrow files)\ndatasets:\n - path: /path/to/your/directory\n\n\nLoading specific files in directory\nProvide data_files with a list of files to load.\ndatasets:\n # single file\n - path: /path/to/your/directory\n ds_type: csv\n data_files: file1.csv\n\n # multiple files\n - path: /path/to/your/directory\n ds_type: json\n data_files:\n - file1.jsonl\n - file2.jsonl\n\n # multiple files for parquet\n - path: /path/to/your/directory\n ds_type: parquet\n data_files:\n - file1.parquet\n - file2.parquet\n\n\n\nHuggingFace Hub\nThe method you use to load the dataset depends on how the dataset was created: whether a folder was uploaded directly, or a HuggingFace Dataset was pushed.\n\n\n\n\n\n\nNote\n\n\n\nIf you’re using a private dataset, you will need to enable the hf_use_auth_token flag at the root level of the config file.\n\n\n\nFolder uploaded\nThis means that the dataset is a single file or multiple files uploaded directly to the Hub.\ndatasets:\n - path: org/dataset-name\n data_files:\n - file1.jsonl\n - file2.jsonl\n\n\nHuggingFace Dataset\nThis means that the dataset was created as a HuggingFace Dataset and pushed to the Hub via datasets.push_to_hub.\ndatasets:\n - path: org/dataset-name\n\n\n\n\n\n\nNote\n\n\n\nSome other configs, such as name, split, revision, and trust_remote_code, may be required depending on the dataset.\n\n\n\n\n\nRemote Filesystems\nVia the storage_options config under load_dataset, you can load datasets from remote filesystems like S3, GCS, Azure, and OCI.\n\n\n\n\n\n\nWarning\n\n\n\nThis is currently experimental. 
Please let us know if you run into any issues!\n\n\nThe only difference between the providers is that you need to prepend the path with the respective protocol.\ndatasets:\n # Single file\n - path: s3://bucket-name/path/to/your/file.jsonl\n\n # Directory\n - path: s3://bucket-name/path/to/your/directory\nFor directories, we load via load_from_disk.\n\nS3\nPrepend the path with s3://.\nThe credentials are pulled in the following order:\n\nAWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_SESSION_TOKEN environment variables\nfrom the ~/.aws/credentials file\nfor nodes on EC2, the IAM metadata provider\n\n\n\n\n\n\n\nNote\n\n\n\nWe assume you have credentials set up and are not using anonymous access. If you want to use anonymous access, let us know! We may have to open a config option for this.\n\n\nOther environment variables that can be set can be found in the boto3 docs\n\n\nGCS\nPrepend the path with gs:// or gcs://.\nThe credentials are loaded in the following order:\n\ngcloud credentials\nfor nodes on GCP, the google metadata service\nanonymous access\n\n\n\nAzure\n\nGen 1\nPrepend the path with adl://.\nEnsure you have the following environment variables set:\n\nAZURE_STORAGE_TENANT_ID\nAZURE_STORAGE_CLIENT_ID\nAZURE_STORAGE_CLIENT_SECRET\n\n\n\nGen 2\nPrepend the path with abfs:// or az://.\nEnsure you have the following environment variables set:\n\nAZURE_STORAGE_ACCOUNT_NAME\nAZURE_STORAGE_ACCOUNT_KEY\n\nOther environment variables that can be set can be found in the adlfs docs\n\n\n\nOCI\nPrepend the path with oci://.\nCredentials are read in the following order:\n\nOCIFS_IAM_TYPE, OCIFS_CONFIG_LOCATION, and OCIFS_CONFIG_PROFILE environment variables\nwhen on an OCI resource, the resource principal\n\nOther environment variables:\n\nOCI_REGION_METADATA\n\nPlease see the ocifs docs.\n\n\n\nHTTPS\nThe path should start with https://.\ndatasets:\n - path: https://path/to/your/dataset/file.jsonl\nThis must be publicly accessible.", "crumbs": [ "How To Guides", "Dataset Loading" ] }, { "objectID": "docs/dataset_loading.html#next-steps", "href": "docs/dataset_loading.html#next-steps", "title": "Dataset Loading", "section": "Next steps", "text": "Next steps\nNow that you know how to load datasets, see the dataset formats docs to learn how to map your specific dataset format to your target output format.", "crumbs": [ "How To Guides", "Dataset Loading" ] }, { "objectID": "docs/getting-started.html", "href": "docs/getting-started.html", "title": "Quickstart", "section": "", "text": "This guide will walk you through your first model fine-tuning project with Axolotl.", "crumbs": [ "Getting Started", "Quickstart" ] }, { "objectID": "docs/getting-started.html#sec-quick-example", "href": "docs/getting-started.html#sec-quick-example", "title": "Quickstart", "section": "1 Quick Example", "text": "1 Quick Example\nLet’s start by fine-tuning a small language model using LoRA. This example uses a 1B parameter model to ensure it runs on most GPUs.\nAssuming axolotl is installed (if not, see our Installation Guide):\n\nDownload example configs:\n\naxolotl fetch examples\n\nRun the training:\n\naxolotl train examples/llama-3/lora-1b.yml\nThat’s it! 
Let’s understand what just happened.", "crumbs": [ "Getting Started", "Quickstart" ] }, { "objectID": "docs/getting-started.html#sec-understanding", "href": "docs/getting-started.html#sec-understanding", "title": "Quickstart", "section": "2 Understanding the Process", "text": "2 Understanding the Process\n\n2.1 The Configuration File\nThe YAML configuration file controls everything about your training. Here’s what (part of) our example config looks like:\nbase_model: NousResearch/Llama-3.2-1B\n\nload_in_8bit: true\nadapter: lora\n\ndatasets:\n - path: teknium/GPT4-LLM-Cleaned\n type: alpaca\ndataset_prepared_path: last_run_prepared\nval_set_size: 0.1\noutput_dir: ./outputs/lora-out\n\n\n\n\n\n\nTip\n\n\n\nload_in_8bit: true and adapter: lora enable LoRA adapter finetuning.\n\nTo perform full finetuning, remove these two lines.\nTo perform QLoRA finetuning, replace them with load_in_4bit: true and adapter: qlora.\n\n\n\nSee our config options for more details.\n\n\n2.2 Training\nWhen you run axolotl train, Axolotl:\n\nDownloads the base model\n(If specified) applies QLoRA/LoRA adapter layers\nLoads and processes the dataset\nRuns the training loop\nSaves the trained model and/or LoRA weights", "crumbs": [ "Getting Started", "Quickstart" ] }, { "objectID": "docs/getting-started.html#sec-custom", "href": "docs/getting-started.html#sec-custom", "title": "Quickstart", "section": "3 Your First Custom Training", "text": "3 Your First Custom Training\nLet’s modify the example for your own data:\n\nCreate a new config file my_training.yml:\n\nbase_model: NousResearch/Nous-Hermes-llama-1b-v1\n\nload_in_8bit: true\nadapter: lora\n\n# Training settings\nmicro_batch_size: 2\nnum_epochs: 3\nlearning_rate: 0.0003\n\n# Your dataset\ndatasets:\n - path: my_data.jsonl # Your local data file\n type: alpaca # Or other format\nThis specific config is for LoRA fine-tuning a model on instruction tuning data in\nthe alpaca dataset format, which looks like this:\n{\n \"instruction\": \"Write a description of alpacas.\",\n \"input\": \"\",\n \"output\": \"Alpacas are domesticated South American camelids...\"\n}\nPlease see our Dataset Formats for other supported dataset formats and how to\nformat them.\n\nPrepare your JSONL data in the specified format (in this case, the expected alpaca\nformat):\n\n{\"instruction\": \"Classify this text\", \"input\": \"I love this!\", \"output\": \"positive\"}\n{\"instruction\": \"Classify this text\", \"input\": \"Not good at all\", \"output\": \"negative\"}\n\nRun the training:\n\naxolotl train my_training.yml", "crumbs": [ "Getting Started", "Quickstart" ] }, { "objectID": "docs/getting-started.html#sec-common-tasks", "href": "docs/getting-started.html#sec-common-tasks", "title": "Quickstart", "section": "4 Common Tasks", "text": "4 Common Tasks\n\n\n\n\n\n\nTip\n\n\n\nThe same yaml file is used for training, inference, and merging.\n\n\n\n4.1 Testing Your Model\nAfter training, test your model:\naxolotl inference my_training.yml --lora-model-dir=\"./outputs/lora-out\"\nMore details can be found in Inference.\n\n\n4.2 Using a UI\nLaunch a Gradio interface:\naxolotl inference my_training.yml --lora-model-dir=\"./outputs/lora-out\" --gradio\n\n\n4.3 Preprocessing Data\nFor large datasets, preprocess first:\naxolotl preprocess my_training.yml\nPlease make sure to set dataset_prepared_path: in your config so the prepared dataset is saved to that path.\nMore details can be found in Dataset Preprocessing.\n\n\n4.4 
Merging LoRA weights\nTo merge the LoRA weights back into the base model, run:\naxolotl merge-lora my_training.yml --lora-model-dir=\"./outputs/lora-out\"\nThe merged model will be saved in the {output_dir}/merged directory.\nMore details can be found in Merging LoRA weights.", - "crumbs": [ - "Getting Started", - "Quickstart" - ] - }, - { - "objectID": "docs/getting-started.html#sec-next-steps", - "href": "docs/getting-started.html#sec-next-steps", - "title": "Quickstart", - "section": "5 Next Steps", - "text": "5 Next Steps\nNow that you have the basics, you might want to:\n\nTry different model architectures\nExperiment with hyperparameters\nUse more advanced training methods\nScale up to larger models\n\nCheck our other guides for details on these topics:\n\nConfiguration Guide - Full configuration options\nDataset Loading - Loading datasets from various sources\nDataset Formats - Working with different data formats\nMulti-GPU Training\nMulti-Node Training", - "crumbs": [ - "Getting Started", - "Quickstart" - ] - }, - { - "objectID": "docs/lora_optims.html", - "href": "docs/lora_optims.html", - "title": "LoRA Optimizations", - "section": "", - "text": "Inspired by Unsloth, we’ve implemented two\noptimizations for LoRA and QLoRA fine-tuning, supporting both single GPU and multi-GPU\n(in the DDP and DeepSpeed settings) training. These include (1) SwiGLU and GEGLU activation function\nTriton kernels, and (2) LoRA MLP and attention custom autograd functions. Our goal was\nto leverage operator fusion and tensor re-use in order to improve speed and reduce\nmemory usage during the forward and backward passes of these calculations.\nWe currently support several common model architectures, including (but not limited to):", - "crumbs": [ - "How To Guides", - "LoRA Optimizations" - ] - }, - { - "objectID": "docs/lora_optims.html#usage", - "href": "docs/lora_optims.html#usage", - "title": "LoRA Optimizations", - "section": "Usage", - "text": "Usage\nThese optimizations can be enabled in your Axolotl config YAML file. 
The\nlora_mlp_kernel option enables the optimized MLP path, while lora_qkv_kernel and\nlora_o_kernel enable the fused query-key-value projection and optimized output\nprojection, respectively.\nlora_mlp_kernel: true\nlora_qkv_kernel: true\nlora_o_kernel: true\n\n\n\n\n\n\nNote\n\n\n\nCurrently, LoRA kernels are not supported for RLHF training, only SFT.", - "crumbs": [ - "How To Guides", - "LoRA Optimizations" - ] - }, - { - "objectID": "docs/lora_optims.html#requirements", - "href": "docs/lora_optims.html#requirements", - "title": "LoRA Optimizations", - "section": "Requirements", - "text": "Requirements\n\nOne or more NVIDIA or AMD GPUs (in order to use the Triton kernels)\n\nNote: Set TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 to enable memory-efficient attention on AMD GPUs\n\nTargeted LoRA adapters cannot use Dropout\n\nThis may limit model expressivity / cause overfitting\n\nTargeted LoRA adapters cannot have bias terms\n\nThis may limit model expressivity\n\n\nModels with pre-existing LoRA adapters that use Dropout or have bias terms may need to\nbe re-finetuned without these features in order to be useful.", - "crumbs": [ - "How To Guides", - "LoRA Optimizations" - ] - }, - { - "objectID": "docs/lora_optims.html#implementation-details", - "href": "docs/lora_optims.html#implementation-details", - "title": "LoRA Optimizations", - "section": "Implementation details", - "text": "Implementation details\n\nCustom autograd functions\nThe LoRA MLP autograd function optimizes the entire MLP computation path. It fuses the\nLoRA and base weight computations together and provides a single, efficient backward\npass for the entire MLP block.\nFor attention components, similar optimizations are provided through a function that\nhandles the query, key, and value projections, and a function that handles the output\nprojection. They are designed to work with the existing transformers attention\nimplementation via some monkey-patching logic.\n\n\nTriton kernels\nTwo activation functions (SwiGLU and GeGLU) are implemented with Triton kernels for\nimproved speed and memory performance. These kernels handle both the forward and\nbackward passes.\n\n\nIntegration\nThe custom autograd functions and Triton kernels are designed to work together. The\nautograd function manages the high-level computation flow and gradient tracking, while\ncalling the Triton kernels for the activation function computation. 
During the backward\npass, the kernel computes both the activation output and the required gradients, which\nthe autograd function then uses to compute the final gradients for the entire\ncomputation path.", "crumbs": [ "How To Guides", "LoRA Optimizations" ] }, { "objectID": "docs/lora_optims.html#future-work", "href": "docs/lora_optims.html#future-work", "title": "LoRA Optimizations", "section": "Future Work", "text": "Future Work\n\nSupport for additional model architectures\nSupport for the FSDP setting\nSupport for dropout and bias\nAdditional operator fusions", "crumbs": [ "How To Guides", "LoRA Optimizations" ] }, { "objectID": "docs/multi-node.html", "href": "docs/multi-node.html", "title": "Multi Node", "section": "", "text": "Below are three ways to train multi-node in Axolotl.", "crumbs": [ "Deployments", "Multi Node" ] }, { "objectID": "docs/multi-node.html#accelerate", "href": "docs/multi-node.html#accelerate", "title": "Multi Node", "section": "Accelerate", "text": "Accelerate\nYou will need to create a configuration for accelerate, either by running accelerate config and following the instructions, or by using one of the presets below:\n~/.cache/huggingface/accelerate/default_config.yaml\ncompute_environment: LOCAL_MACHINE\ndebug: false\ndistributed_type: FSDP\ndowncast_bf16: 'no'\nmachine_rank: 0 # Set to 0 for the main machine, increment by one for other machines\nmain_process_ip: 10.0.0.4 # Set to main machine's IP\nmain_process_port: 5000\nmain_training_function: main\nmixed_precision: bf16\nnum_machines: 2 # Change to the number of machines\nnum_processes: 4 # That's the total number of GPUs (for example: if you have 2 machines with 4 GPUs each, put 8)\nrdzv_backend: static\nsame_network: true\ntpu_env: []\ntpu_use_cluster: false\ntpu_use_sudo: false\nuse_cpu: false\nConfigure your model to use FSDP in the Axolotl yaml. 
For example:\nfsdp_version: 2\nfsdp_config:\n offload_params: true\n state_dict_type: FULL_STATE_DICT\n auto_wrap_policy: TRANSFORMER_BASED_WRAP\n transformer_layer_cls_to_wrap: LlamaDecoderLayer\n reshard_after_forward: true\nAll you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.", - "crumbs": [ - "Deployments", - "Multi Node" - ] - }, - { - "objectID": "docs/multi-node.html#raytrain", - "href": "docs/multi-node.html#raytrain", - "title": "Multi Node", - "section": "Raytrain", - "text": "Raytrain\nPlease see ray train doc here.", - "crumbs": [ - "Deployments", - "Multi Node" - ] - }, - { - "objectID": "docs/multi-node.html#torchrun", - "href": "docs/multi-node.html#torchrun", - "title": "Multi Node", - "section": "Torchrun", - "text": "Torchrun\nIf you are using Infiniband, we recommend torchrun to utilize the full bandwidth.\nSet the following env (change buffersize/socketname depending on your system):\nexport NCCL_IB_DISABLE=0\nexport NCCL_SOCKET_IFNAME=\"eth0,en,eth,em,bond\"\nexport NCCL_BUFFSIZE=2097152\nRun the following on each node:\n\nOption 1: New Axolotl CLI with launcher args (Recommended)\naxolotl train config.yaml --launcher torchrun -- --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint \"$head_node_ip:$head_node_port\"\n\n\nOption 2: Direct torchrun (Legacy)\ntorchrun --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint \"$head_node_ip:$head_node_port\" -m axolotl.cli.train config.yaml\nPlease make sure to substitute the placeholder variables:\n\nnum_nodes: Number of nodes (containing GPUs)\ngpu_per_node: Number of gpus per node\nhead_node_ip: IP of the head node (make sure other machines can connect to this)\nhead_node_port: Port of the head node (make sure other machines can connect to this. Default 29400)\nrdzv_id: A unique job ID that is used by the job across nodes.\n\nThe new CLI approach (Option 1) is recommended as it provides consistent argument handling and works seamlessly with other Axolotl CLI features.\nMore info on the available configs can be found on the Pytorch docs here", - "crumbs": [ - "Deployments", - "Multi Node" - ] - }, - { - "objectID": "docs/fsdp_qlora.html", - "href": "docs/fsdp_qlora.html", - "title": "FDSP + QLoRA", - "section": "", - "text": "Using FSDP with QLoRA is essential for fine-tuning larger (70b+ parameter) LLMs on consumer GPUs. For example, you can use FSDP + QLoRA to train a 70b model on two 24GB GPUs1.\nBelow, we describe how to use this feature in Axolotl.", + "text": "# FSDP config. See https://docs.axolotl.ai/docs/multi-gpu.html#sec-fsdp\nfsdp_version: 2\nfsdp_config:\n # ...\n\n# The number of GPUs to shard the model parameters across (FSDP dimension).\ndp_shard_size: 4\n\n# The number of times to replicate the sharded model (DDP dimension).\ndp_replicate_size: 2\n\n# Number of GPUs for Tensor Parallelism.\ntensor_parallel_size: 1 # (default is 1, no TP)\n\n# Number of GPUs for Context/Sequence Parallelism.\ncontext_parallel_size: 1 # (default is 1, no CP)\nNote: We recommend FSDP. 
DeepSpeed is only compatible with tensor_parallel_size.", "crumbs": [ "Advanced Features", - "FDSP + QLoRA" + "N-D Parallelism" ] }, { - "objectID": "docs/fsdp_qlora.html#background", - "href": "docs/fsdp_qlora.html#background", - "title": "FDSP + QLoRA", + "objectID": "docs/nd_parallelism.html#examples", + "href": "docs/nd_parallelism.html#examples", + "title": "N-D Parallelism", "section": "", - "text": "Using FSDP with QLoRA is essential for fine-tuning larger (70b+ parameter) LLMs on consumer GPUs. For example, you can use FSDP + QLoRA to train a 70b model on two 24GB GPUs1.\nBelow, we describe how to use this feature in Axolotl.", + "text": "HSDP on 2 nodes with 4 GPUs each (8 GPUs total):\n\nYou want FSDP within each node and DDP across nodes.\nSet dp_shard_size: 4 and dp_replicate_size: 2.\n\nFSDP + TP on a single 8-GPU node:\n\nYou want to split the model across 4 GPUs using FSDP, and further split each layer across 2 GPUs with TP.\nSet dp_shard_size: 4 and tensor_parallel_size: 2.\n\nFSDP + CP on a single 8-GPU node for long context:\n\nYou want to shard the model across all 8 GPUs and also split the sequence length across all 8 GPUs.\nSet dp_shard_size: 8 and context_parallel_size: 8. Note: this means the data parallel group and context parallel group are the same. A more common setup might be to shard across a smaller group.", "crumbs": [ "Advanced Features", - "FDSP + QLoRA" + "N-D Parallelism" ] }, { - "objectID": "docs/fsdp_qlora.html#usage", - "href": "docs/fsdp_qlora.html#usage", - "title": "FDSP + QLoRA", - "section": "Usage", - "text": "Usage\nTo enable QLoRA with FSDP, you need to perform the following steps:\n\n![Tip]\nSee the example config file in addition to reading these instructions.\n\n\nSet adapter: qlora in your axolotl config file.\nEnable FSDP in your axolotl config, as described here.\nUse one of the supported model types: llama, mistral or mixtral.", - "crumbs": [ - "Advanced Features", - "FDSP + QLoRA" - ] - }, - { - "objectID": "docs/fsdp_qlora.html#example-config", - "href": "docs/fsdp_qlora.html#example-config", - "title": "FDSP + QLoRA", - "section": "Example Config", - "text": "Example Config\nexamples/llama-2/qlora-fsdp.yml contains an example of how to enable QLoRA + FSDP in axolotl.", - "crumbs": [ - "Advanced Features", - "FDSP + QLoRA" - ] - }, - { - "objectID": "docs/fsdp_qlora.html#references", - "href": "docs/fsdp_qlora.html#references", - "title": "FDSP + QLoRA", - "section": "References", - "text": "References\n\nPR #1378 enabling QLoRA in FSDP in Axolotl.\nBlog Post from the Answer.AI team describing the work that enabled QLoRA in FSDP.\nRelated HuggingFace PRs Enabling FDSP + QLoRA:\n\nAccelerate PR#2544\nTransformers PR#29587\nTRL PR#1416\nPEFT PR#1550", - "crumbs": [ - "Advanced Features", - "FDSP + QLoRA" - ] - }, - { - "objectID": "docs/fsdp_qlora.html#footnotes", - "href": "docs/fsdp_qlora.html#footnotes", - "title": "FDSP + QLoRA", - "section": "Footnotes", - "text": "Footnotes\n\n\nThis was enabled by this work from the Answer.AI team.↩︎", - "crumbs": [ - "Advanced Features", - "FDSP + QLoRA" - ] - }, - { - "objectID": "docs/inference.html", - "href": "docs/inference.html", - "title": "Inference and Merging", + "objectID": "docs/nd_parallelism.html#support-matrix", + "href": "docs/nd_parallelism.html#support-matrix", + "title": "N-D Parallelism", "section": "", - "text": "This guide covers how to use your trained models for inference, including model loading, interactive testing, merging adapters, and common troubleshooting steps.", 
+ "text": "This matrix describes how different parallelism methods can be combined in Axolotl.\n\n\n\n\n\n\n\n\n\n\n\nCombination\ndp_replicate_size\ndp_shard_size\ntp_size\ncp_size\nStatus & Notes\n\n\n\n\nFSDP (ZeRO-3)\n1\n>1\n1\n1\n✅ Fully supported. Shards model across all GPUs.\n\n\nHSDP\n>1\n>1\n1\n1\n✅ Fully supported. FSDP intra-node, DDP inter-node.\n\n\nFSDP + TP\n1\n>1\n>1\n1\n✅ 2D Parallelism. Shards the model across a dp_shard group, and TP-splits layers within the tp group.\n\n\nHSDP + TP\n>1\n>1\n>1\n1\n✅ 3D Parallelism. A powerful but complex combination.\n\n\nFSDP + CP\n1\n>1\n1\n>1\n✅ 2D Parallelism. Combines FSDP with context parallelism.\n\n\nFSDP + TP + CP\n1\n>1\n>1\n>1\n✅ 3D Parallelism. Another advanced combination.\n\n\nDDP + TP/CP\n>1\n1\n>1\n>1\n❌ Not Supported. The ParallelismConfig explicitly prevents this, as composing pure DDP with TP/CP without FSDP is inefficient and complex. You should use FSDP instead (dp_shard_size > 1).\n\n\nJust TP / CP\n1\n1\n>1\n>1\n✅ Supported. Useful for inference or when the model fits on one GPU but context is too long.\n\n\n\n\ntp_size refers to tensor_parallel_size\ncp_size refers to context_parallel_size", "crumbs": [ - "Getting Started", - "Inference and Merging" - ] - }, - { - "objectID": "docs/inference.html#sec-quickstart", - "href": "docs/inference.html#sec-quickstart", - "title": "Inference and Merging", - "section": "1 Quick Start", - "text": "1 Quick Start\n\n\n\n\n\n\nTip\n\n\n\nUse the same config used for training on inference/merging.\n\n\n\n1.1 Basic Inference\n\nLoRA ModelsFull Fine-tuned Models\n\n\naxolotl inference your_config.yml --lora-model-dir=\"./lora-output-dir\"\n\n\naxolotl inference your_config.yml --base-model=\"./completed-model\"", - "crumbs": [ - "Getting Started", - "Inference and Merging" - ] - }, - { - "objectID": "docs/inference.html#sec-advanced", - "href": "docs/inference.html#sec-advanced", - "title": "Inference and Merging", - "section": "2 Advanced Usage", - "text": "2 Advanced Usage\n\n2.1 Gradio Interface\nLaunch an interactive web interface:\naxolotl inference your_config.yml --gradio\n\n\n2.2 File-based Prompts\nProcess prompts from a text file:\ncat /tmp/prompt.txt | axolotl inference your_config.yml \\\n --base-model=\"./completed-model\" --prompter=None\n\n\n2.3 Memory Optimization\nFor large models or limited memory:\naxolotl inference your_config.yml --load-in-8bit=True", - "crumbs": [ - "Getting Started", - "Inference and Merging" - ] - }, - { - "objectID": "docs/inference.html#sec-merging", - "href": "docs/inference.html#sec-merging", - "title": "Inference and Merging", - "section": "3 Merging LoRA Weights", - "text": "3 Merging LoRA Weights\nMerge LoRA adapters with the base model:\naxolotl merge-lora your_config.yml --lora-model-dir=\"./completed-model\"\n\n3.1 Memory Management for Merging\n\nConfiguration OptionsForce CPU Merging\n\n\ngpu_memory_limit: 20GiB # Adjust based on your GPU\nlora_on_cpu: true # Process on CPU if needed\n\n\nCUDA_VISIBLE_DEVICES=\"\" axolotl merge-lora ...", - "crumbs": [ - "Getting Started", - "Inference and Merging" - ] - }, - { - "objectID": "docs/inference.html#sec-tokenization", - "href": "docs/inference.html#sec-tokenization", - "title": "Inference and Merging", - "section": "4 Tokenization", - "text": "4 Tokenization\n\n4.1 Common Issues\n\n\n\n\n\n\nWarning\n\n\n\nTokenization mismatches between training and inference are a common source of problems.\n\n\nTo debug:\n\nCheck training tokenization:\n\naxolotl preprocess your_config.yml 
--debug\n\nVerify inference tokenization by decoding tokens before model input\nCompare token IDs between training and inference\n\n\n\n4.2 Special Tokens\nConfigure special tokens in your YAML:\nspecial_tokens:\n bos_token: \"<s>\"\n eos_token: \"</s>\"\n unk_token: \"<unk>\"\ntokens:\n - \"<|im_start|>\"\n - \"<|im_end|>\"", - "crumbs": [ - "Getting Started", - "Inference and Merging" - ] - }, - { - "objectID": "docs/inference.html#sec-troubleshooting", - "href": "docs/inference.html#sec-troubleshooting", - "title": "Inference and Merging", - "section": "5 Troubleshooting", - "text": "5 Troubleshooting\n\n5.1 Common Problems\n\nMemory IssuesToken IssuesPerformance Issues\n\n\n\nUse 8-bit loading\nReduce batch sizes\nTry CPU offloading\n\n\n\n\nVerify special tokens\nCheck tokenizer settings\nCompare training and inference preprocessing\n\n\n\n\nVerify model loading\nCheck prompt formatting\nEnsure temperature/sampling settings\n\n\n\n\nFor more details, see our debugging guide.", - "crumbs": [ - "Getting Started", - "Inference and Merging" + "Advanced Features", + "N-D Parallelism" ] }, { @@ -2330,6 +1748,643 @@ "Sequence Parallelism" ] }, + { + "objectID": "docs/inference.html", + "href": "docs/inference.html", + "title": "Inference and Merging", + "section": "", + "text": "This guide covers how to use your trained models for inference, including model loading, interactive testing, merging adapters, and common troubleshooting steps.", + "crumbs": [ + "Getting Started", + "Inference and Merging" + ] + }, + { + "objectID": "docs/inference.html#sec-quickstart", + "href": "docs/inference.html#sec-quickstart", + "title": "Inference and Merging", + "section": "1 Quick Start", + "text": "1 Quick Start\n\n\n\n\n\n\nTip\n\n\n\nUse the same config used for training on inference/merging.\n\n\n\n1.1 Basic Inference\n\nLoRA ModelsFull Fine-tuned Models\n\n\naxolotl inference your_config.yml --lora-model-dir=\"./lora-output-dir\"\n\n\naxolotl inference your_config.yml --base-model=\"./completed-model\"", + "crumbs": [ + "Getting Started", + "Inference and Merging" + ] + }, + { + "objectID": "docs/inference.html#sec-advanced", + "href": "docs/inference.html#sec-advanced", + "title": "Inference and Merging", + "section": "2 Advanced Usage", + "text": "2 Advanced Usage\n\n2.1 Gradio Interface\nLaunch an interactive web interface:\naxolotl inference your_config.yml --gradio\n\n\n2.2 File-based Prompts\nProcess prompts from a text file:\ncat /tmp/prompt.txt | axolotl inference your_config.yml \\\n --base-model=\"./completed-model\" --prompter=None\n\n\n2.3 Memory Optimization\nFor large models or limited memory:\naxolotl inference your_config.yml --load-in-8bit=True", + "crumbs": [ + "Getting Started", + "Inference and Merging" + ] + }, + { + "objectID": "docs/inference.html#sec-merging", + "href": "docs/inference.html#sec-merging", + "title": "Inference and Merging", + "section": "3 Merging LoRA Weights", + "text": "3 Merging LoRA Weights\nMerge LoRA adapters with the base model:\naxolotl merge-lora your_config.yml --lora-model-dir=\"./completed-model\"\n\n3.1 Memory Management for Merging\n\nConfiguration OptionsForce CPU Merging\n\n\ngpu_memory_limit: 20GiB # Adjust based on your GPU\nlora_on_cpu: true # Process on CPU if needed\n\n\nCUDA_VISIBLE_DEVICES=\"\" axolotl merge-lora ...", + "crumbs": [ + "Getting Started", + "Inference and Merging" + ] + }, + { + "objectID": "docs/inference.html#sec-tokenization", + "href": "docs/inference.html#sec-tokenization", + "title": "Inference and Merging", + 
"section": "4 Tokenization", + "text": "4 Tokenization\n\n4.1 Common Issues\n\n\n\n\n\n\nWarning\n\n\n\nTokenization mismatches between training and inference are a common source of problems.\n\n\nTo debug:\n\nCheck training tokenization:\n\naxolotl preprocess your_config.yml --debug\n\nVerify inference tokenization by decoding tokens before model input\nCompare token IDs between training and inference\n\n\n\n4.2 Special Tokens\nConfigure special tokens in your YAML:\nspecial_tokens:\n bos_token: \"<s>\"\n eos_token: \"</s>\"\n unk_token: \"<unk>\"\ntokens:\n - \"<|im_start|>\"\n - \"<|im_end|>\"", + "crumbs": [ + "Getting Started", + "Inference and Merging" + ] + }, + { + "objectID": "docs/inference.html#sec-troubleshooting", + "href": "docs/inference.html#sec-troubleshooting", + "title": "Inference and Merging", + "section": "5 Troubleshooting", + "text": "5 Troubleshooting\n\n5.1 Common Problems\n\nMemory IssuesToken IssuesPerformance Issues\n\n\n\nUse 8-bit loading\nReduce batch sizes\nTry CPU offloading\n\n\n\n\nVerify special tokens\nCheck tokenizer settings\nCompare training and inference preprocessing\n\n\n\n\nVerify model loading\nCheck prompt formatting\nEnsure temperature/sampling settings\n\n\n\n\nFor more details, see our debugging guide.", + "crumbs": [ + "Getting Started", + "Inference and Merging" + ] + }, + { + "objectID": "docs/fsdp_qlora.html", + "href": "docs/fsdp_qlora.html", + "title": "FDSP + QLoRA", + "section": "", + "text": "Using FSDP with QLoRA is essential for fine-tuning larger (70b+ parameter) LLMs on consumer GPUs. For example, you can use FSDP + QLoRA to train a 70b model on two 24GB GPUs1.\nBelow, we describe how to use this feature in Axolotl.", + "crumbs": [ + "Advanced Features", + "FDSP + QLoRA" + ] + }, + { + "objectID": "docs/fsdp_qlora.html#background", + "href": "docs/fsdp_qlora.html#background", + "title": "FDSP + QLoRA", + "section": "", + "text": "Using FSDP with QLoRA is essential for fine-tuning larger (70b+ parameter) LLMs on consumer GPUs. 
For example, you can use FSDP + QLoRA to train a 70b model on two 24GB GPUs1.\nBelow, we describe how to use this feature in Axolotl.", "crumbs": [ "Advanced Features", "FDSP + QLoRA" ] }, { "objectID": "docs/fsdp_qlora.html#usage", "href": "docs/fsdp_qlora.html#usage", "title": "FDSP + QLoRA", "section": "Usage", "text": "Usage\nTo enable QLoRA with FSDP, you need to perform the following steps:\n\nTip\nSee the example config file in addition to reading these instructions.\n\n\nSet adapter: qlora in your axolotl config file.\nEnable FSDP in your axolotl config, as described here.\nUse one of the supported model types: llama, mistral, or mixtral.", "crumbs": [ "Advanced Features", "FDSP + QLoRA" ] }, { "objectID": "docs/fsdp_qlora.html#example-config", "href": "docs/fsdp_qlora.html#example-config", "title": "FDSP + QLoRA", "section": "Example Config", "text": "Example Config\nexamples/llama-2/qlora-fsdp.yml contains an example of how to enable QLoRA + FSDP in axolotl.", "crumbs": [ "Advanced Features", "FDSP + QLoRA" ] }, { "objectID": "docs/fsdp_qlora.html#references", "href": "docs/fsdp_qlora.html#references", "title": "FDSP + QLoRA", "section": "References", "text": "References\n\nPR #1378 enabling QLoRA in FSDP in Axolotl.\nBlog Post from the Answer.AI team describing the work that enabled QLoRA in FSDP.\nRelated HuggingFace PRs enabling FSDP + QLoRA:\n\nAccelerate PR#2544\nTransformers PR#29587\nTRL PR#1416\nPEFT PR#1550", "crumbs": [ "Advanced Features", "FDSP + QLoRA" ] }, { "objectID": "docs/fsdp_qlora.html#footnotes", "href": "docs/fsdp_qlora.html#footnotes", "title": "FDSP + QLoRA", "section": "Footnotes", "text": "Footnotes\n\n\nThis was enabled by this work from the Answer.AI team.↩︎", "crumbs": [ "Advanced Features", "FDSP + QLoRA" ] }, { "objectID": "docs/multi-node.html", "href": "docs/multi-node.html", "title": "Multi Node", "section": "", "text": "Below are three ways to train multi-node in Axolotl.", "crumbs": [ "Deployments", "Multi Node" ] }, { "objectID": "docs/multi-node.html#accelerate", "href": "docs/multi-node.html#accelerate", "title": "Multi Node", "section": "Accelerate", "text": "Accelerate\nYou will need to create a configuration for accelerate, either by running accelerate config and following the instructions, or by using one of the presets below:\n~/.cache/huggingface/accelerate/default_config.yaml\ncompute_environment: LOCAL_MACHINE\ndebug: false\ndistributed_type: FSDP\ndowncast_bf16: 'no'\nmachine_rank: 0 # Set to 0 for the main machine, increment by one for other machines\nmain_process_ip: 10.0.0.4 # Set to main machine's IP\nmain_process_port: 5000\nmain_training_function: main\nmixed_precision: bf16\nnum_machines: 2 # Change to the number of machines\nnum_processes: 4 # That's the total number of GPUs (for example: if you have 2 machines with 4 GPUs each, put 8)\nrdzv_backend: static\nsame_network: true\ntpu_env: []\ntpu_use_cluster: false\ntpu_use_sudo: false\nuse_cpu: false\nConfigure your model to use FSDP in the Axolotl yaml. 
For example:\nfsdp_version: 2\nfsdp_config:\n offload_params: true\n state_dict_type: FULL_STATE_DICT\n auto_wrap_policy: TRANSFORMER_BASED_WRAP\n transformer_layer_cls_to_wrap: LlamaDecoderLayer\n reshard_after_forward: true\nAll you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.", + "crumbs": [ + "Deployments", + "Multi Node" + ] + }, + { + "objectID": "docs/multi-node.html#raytrain", + "href": "docs/multi-node.html#raytrain", + "title": "Multi Node", + "section": "Raytrain", + "text": "Raytrain\nPlease see ray train doc here.", + "crumbs": [ + "Deployments", + "Multi Node" + ] + }, + { + "objectID": "docs/multi-node.html#torchrun", + "href": "docs/multi-node.html#torchrun", + "title": "Multi Node", + "section": "Torchrun", + "text": "Torchrun\nIf you are using Infiniband, we recommend torchrun to utilize the full bandwidth.\nSet the following env (change buffersize/socketname depending on your system):\nexport NCCL_IB_DISABLE=0\nexport NCCL_SOCKET_IFNAME=\"eth0,en,eth,em,bond\"\nexport NCCL_BUFFSIZE=2097152\nRun the following on each node:\n\nOption 1: New Axolotl CLI with launcher args (Recommended)\naxolotl train config.yaml --launcher torchrun -- --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint \"$head_node_ip:$head_node_port\"\n\n\nOption 2: Direct torchrun (Legacy)\ntorchrun --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint \"$head_node_ip:$head_node_port\" -m axolotl.cli.train config.yaml\nPlease make sure to substitute the placeholder variables:\n\nnum_nodes: Number of nodes (containing GPUs)\ngpu_per_node: Number of gpus per node\nhead_node_ip: IP of the head node (make sure other machines can connect to this)\nhead_node_port: Port of the head node (make sure other machines can connect to this. Default 29400)\nrdzv_id: A unique job ID that is used by the job across nodes.\n\nThe new CLI approach (Option 1) is recommended as it provides consistent argument handling and works seamlessly with other Axolotl CLI features.\nMore info on the available configs can be found on the Pytorch docs here", + "crumbs": [ + "Deployments", + "Multi Node" + ] + }, + { + "objectID": "docs/lora_optims.html", + "href": "docs/lora_optims.html", + "title": "LoRA Optimizations", + "section": "", + "text": "Inspired by Unsloth, we’ve implemented two\noptimizations for LoRA and QLoRA fine-tuning, supporting both single GPU and multi-GPU\n(in the DDP and DeepSpeed settings) training. These include (1) SwiGLU and GEGLU activation function\nTriton kernels, and (2) LoRA MLP and attention custom autograd functions. Our goal was\nto leverage operator fusion and tensor re-use in order to improve speed and reduce\nmemory usage during the forward and backward passes of these calculations.\nWe currently support several common model architectures, including (but not limited to):", + "crumbs": [ + "How To Guides", + "LoRA Optimizations" + ] + }, + { + "objectID": "docs/lora_optims.html#usage", + "href": "docs/lora_optims.html#usage", + "title": "LoRA Optimizations", + "section": "Usage", + "text": "Usage\nThese optimizations can be enabled in your Axolotl config YAML file. 
The\nlora_mlp_kernel option enables the optimized MLP path, while lora_qkv_kernel and\nlora_o_kernel enable the fused query-key-value projection and optimized output\nprojection, respectively.\nlora_mlp_kernel: true\nlora_qkv_kernel: true\nlora_o_kernel: true\n\n\n\n\n\n\nNote\n\n\n\nCurrently, LoRA kernels are not supported for RLHF training, only SFT.", + "crumbs": [ + "How To Guides", + "LoRA Optimizations" + ] + }, + { + "objectID": "docs/lora_optims.html#requirements", + "href": "docs/lora_optims.html#requirements", + "title": "LoRA Optimizations", + "section": "Requirements", + "text": "Requirements\n\nOne or more NVIDIA or AMD GPUs (in order to use the Triton kernels)\n\nNote: Set TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 to enable memory-efficient attention on AMD GPUs\n\nTargeted LoRA adapters cannot use Dropout\n\nThis may limit model expressivity / cause overfitting\n\nTargeted LoRA adapters cannot have bias terms\n\nThis may limit model expressivity\n\n\nModels with pre-existing LoRA adapters that use Dropout or have bias terms may need to\nbe re-finetuned without these features in order to be useful.", + "crumbs": [ + "How To Guides", + "LoRA Optimizations" + ] + }, + { + "objectID": "docs/lora_optims.html#implementation-details", + "href": "docs/lora_optims.html#implementation-details", + "title": "LoRA Optimizations", + "section": "Implementation details", + "text": "Implementation details\n\nCustom autograd functions\nThe LoRA MLP autograd function optimizes the entire MLP computation path. It fuses the\nLoRA and base weight computations together and provides a single, efficient backward\npass for the entire MLP block.\nFor attention components, similar optimizations are provided through a function that\nhandles the query, key, and value projections, and a function that handles the output\nprojection. They are designed to work with the existing transformers attention\nimplementation via some monkey-patching logic.\n\n\nTriton kernels\nTwo activation functions (SwiGLU and GeGLU) are implemented with Triton kernels for\nimproved speed and memory performance. These kernels handle both the forward and\nbackward passes.\n\n\nIntegration\nThe custom autograd functions and Triton kernels are designed to work together. The\nautograd function manages the high-level computation flow and gradient tracking, while\ncalling the Triton kernels for the activation function computation. 
During the backward\npass, the kernel computes both the activation output and the required gradients, which\nthe autograd function then uses to compute the final gradients for the entire\ncomputation path.", + "crumbs": [ + "How To Guides", + "LoRA Optimizations" + ] + }, + { + "objectID": "docs/lora_optims.html#future-work", + "href": "docs/lora_optims.html#future-work", + "title": "LoRA Optimizations", + "section": "Future Work", + "text": "Future Work\n\nSupport for additional model architectures\nSupport for the FSDP setting\nSupport for dropout and bias\nAdditional operator fusions", + "crumbs": [ + "How To Guides", + "LoRA Optimizations" + ] + }, + { + "objectID": "docs/getting-started.html", + "href": "docs/getting-started.html", + "title": "Quickstart", + "section": "", + "text": "This guide will walk you through your first model fine-tuning project with Axolotl.", + "crumbs": [ + "Getting Started", + "Quickstart" + ] + }, + { + "objectID": "docs/getting-started.html#sec-quick-example", + "href": "docs/getting-started.html#sec-quick-example", + "title": "Quickstart", + "section": "1 Quick Example", + "text": "1 Quick Example\nLet’s start by fine-tuning a small language model using LoRA. This example uses a 1B parameter model to ensure it runs on most GPUs.\nAssuming axolotl is installed (if not, see our Installation Guide)\n\nDownload example configs:\n\naxolotl fetch examples\n\nRun the training:\n\naxolotl train examples/llama-3/lora-1b.yml\nThat’s it! Let’s understand what just happened.", + "crumbs": [ + "Getting Started", + "Quickstart" + ] + }, + { + "objectID": "docs/getting-started.html#sec-understanding", + "href": "docs/getting-started.html#sec-understanding", + "title": "Quickstart", + "section": "2 Understanding the Process", + "text": "2 Understanding the Process\n\n2.1 The Configuration File\nThe YAML configuration file controls everything about your training. 
Here’s what (part of) our example config looks like:\nbase_model: NousResearch/Llama-3.2-1B\n\nload_in_8bit: true\nadapter: lora\n\ndatasets:\n - path: teknium/GPT4-LLM-Cleaned\n type: alpaca\ndataset_prepared_path: last_run_prepared\nval_set_size: 0.1\noutput_dir: ./outputs/lora-out\n\n\n\n\n\n\nTip\n\n\n\nload_in_8bit: true and adapter: lora enable LoRA adapter finetuning.\n\nTo perform full finetuning, remove these two lines.\nTo perform QLoRA finetuning, replace with load_in_4bit: true and adapter: qlora.\n\n\n\nSee our config options for more details.\n\n\n2.2 Training\nWhen you run axolotl train, Axolotl:\n\nDownloads the base model\n(If specified) applies QLoRA/LoRA adapter layers\nLoads and processes the dataset\nRuns the training loop\nSaves the trained model and/or LoRA weights", + "crumbs": [ + "Getting Started", + "Quickstart" + ] + }, + { + "objectID": "docs/getting-started.html#sec-custom", + "href": "docs/getting-started.html#sec-custom", + "title": "Quickstart", + "section": "3 Your First Custom Training", + "text": "3 Your First Custom Training\nLet’s modify the example for your own data:\n\nCreate a new config file my_training.yml:\n\nbase_model: NousResearch/Nous-Hermes-llama-1b-v1\n\nload_in_8bit: true\nadapter: lora\n\n# Training settings\nmicro_batch_size: 2\nnum_epochs: 3\nlearning_rate: 0.0003\n\n# Your dataset\ndatasets:\n - path: my_data.jsonl # Your local data file\n type: alpaca # Or other format\nThis specific config is for LoRA fine-tuning a model with instruction tuning data using\nthe alpaca dataset format, which has the following format:\n{\n \"instruction\": \"Write a description of alpacas.\",\n \"input\": \"\",\n \"output\": \"Alpacas are domesticated South American camelids...\"\n}\nPlease see our Dataset Formats for more dataset formats and how to\nformat them.\n\nPrepare your JSONL data in the specified format (in this case, the expected alpaca\nformat):\n\n{\"instruction\": \"Classify this text\", \"input\": \"I love this!\", \"output\": \"positive\"}\n{\"instruction\": \"Classify this text\", \"input\": \"Not good at all\", \"output\": \"negative\"}\n\nRun the training:\n\naxolotl train my_training.yml", + "crumbs": [ + "Getting Started", + "Quickstart" + ] + }, + { + "objectID": "docs/getting-started.html#sec-common-tasks", + "href": "docs/getting-started.html#sec-common-tasks", + "title": "Quickstart", + "section": "4 Common Tasks", + "text": "4 Common Tasks\n\n\n\n\n\n\nTip\n\n\n\nThe same yaml file is used for training, inference, and merging.\n\n\n\n4.1 Testing Your Model\nAfter training, test your model:\naxolotl inference my_training.yml --lora-model-dir=\"./outputs/lora-out\"\nMore details can be found in Inference.\n\n\n4.2 Using a UI\nLaunch a Gradio interface:\naxolotl inference my_training.yml --lora-model-dir=\"./outputs/lora-out\" --gradio\n\n\n4.3 Preprocessing Data\nFor large datasets, preprocess first:\naxolotl preprocess my_training.yml\nPlease make sure to set dataset_prepared_path: in your config to specify where to save the prepared dataset.\nMore details can be found in Dataset Preprocessing.\n\n\n4.4 Merging LoRA weights\nTo merge the LoRA weights back into the base model, run:\naxolotl merge-lora my_training.yml --lora-model-dir=\"./outputs/lora-out\"\nThe merged model will be saved in the {output_dir}/merged directory.\nMore details can be found in Merging LoRA weights.", + "crumbs": [ + "Getting Started", + "Quickstart" + ] + }, + { + "objectID": "docs/getting-started.html#sec-next-steps", + "href": 
"docs/getting-started.html#sec-next-steps", + "title": "Quickstart", + "section": "5 Next Steps", + "text": "5 Next Steps\nNow that you have the basics, you might want to:\n\nTry different model architectures\nExperiment with hyperparameters\nUse more advanced training methods\nScale up to larger models\n\nCheck our other guides for details on these topics:\n\nConfiguration Guide - Full configuration options\nDataset Loading - Loading datasets from various sources\nDataset Formats - Working with different data formats\nMulti-GPU Training\nMulti-Node Training", + "crumbs": [ + "Getting Started", + "Quickstart" + ] + }, + { + "objectID": "docs/dataset_loading.html", + "href": "docs/dataset_loading.html", + "title": "Dataset Loading", + "section": "", + "text": "Datasets can be loaded in a number of different ways depending on the how it is saved (the extension of the file) and where it is stored.", + "crumbs": [ + "How To Guides", + "Dataset Loading" + ] + }, + { + "objectID": "docs/dataset_loading.html#overview", + "href": "docs/dataset_loading.html#overview", + "title": "Dataset Loading", + "section": "", + "text": "Datasets can be loaded in a number of different ways depending on the how it is saved (the extension of the file) and where it is stored.", + "crumbs": [ + "How To Guides", + "Dataset Loading" + ] + }, + { + "objectID": "docs/dataset_loading.html#loading-datasets", + "href": "docs/dataset_loading.html#loading-datasets", + "title": "Dataset Loading", + "section": "Loading Datasets", + "text": "Loading Datasets\nWe use the datasets library to load datasets and a mix of load_dataset and load_from_disk to load them.\nYou may recognize the similar named configs between load_dataset and the datasets section of the config file.\ndatasets:\n - path:\n name:\n data_files:\n split:\n revision:\n trust_remote_code:\n\n\n\n\n\n\nTip\n\n\n\nDo not feel overwhelmed by the number of options here. A lot of them are optional. 
In fact, the most common config to use would be path and sometimes data_files.\n\n\nThis matches the API of datasets.load_dataset, so if you’re familiar with that, you will feel right at home.\nFor HuggingFace’s guide to load different dataset types, see here.\nFor full details on the config, see config-reference.qmd.\n\n\n\n\n\n\nNote\n\n\n\nYou can set multiple datasets in the config file by adding more than one entry under datasets.\ndatasets:\n - path: /path/to/your/dataset\n - path: /path/to/your/other/dataset\n\n\n\nLocal dataset\n\nFiles\nTo load a JSON file, you would do something like this:\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"json\", data_files=\"data.json\")\nWhich translates to the following config:\ndatasets:\n - path: data.json\n ds_type: json\nAs the example above shows, you can simply point path to the file or directory, along with the ds_type, to load the dataset.\nThis works for CSV, JSON, Parquet, and Arrow files.\n\n\n\n\n\n\nTip\n\n\n\nIf path points to a file and ds_type is not specified, we will automatically infer the dataset type from the file extension, so you could omit ds_type if you’d like.\n\n\n\n\nDirectory\nIf you’re loading a directory, you can point the path to the directory.\nThen, you have two options:\n\nLoading entire directory\nYou do not need any additional configs.\nWe will attempt to load in the following order:\n- datasets saved with datasets.save_to_disk\n- loading entire directory of files (such as with parquet/arrow files)\ndatasets:\n - path: /path/to/your/directory\n\n\nLoading specific files in directory\nProvide data_files with a list of files to load.\ndatasets:\n # single file\n - path: /path/to/your/directory\n ds_type: csv\n data_files: file1.csv\n\n # multiple files\n - path: /path/to/your/directory\n ds_type: json\n data_files:\n - file1.jsonl\n - file2.jsonl\n\n # multiple files for parquet\n - path: /path/to/your/directory\n ds_type: parquet\n data_files:\n - file1.parquet\n - file2.parquet\n\n\n\n\nHuggingFace Hub\nThe method you use to load the dataset depends on how the dataset was created: whether a folder was uploaded directly or a HuggingFace Dataset was pushed.\n\n\n\n\n\n\nNote\n\n\n\nIf you’re using a private dataset, you will need to enable the hf_use_auth_token flag at the root level of the config file.\n\n\n\nFolder uploaded\nThis would mean that the dataset is a single file or file(s) uploaded to the Hub.\ndatasets:\n - path: org/dataset-name\n data_files:\n - file1.jsonl\n - file2.jsonl\n\n\nHuggingFace Dataset\nThis means that the dataset is created as a HuggingFace Dataset and pushed to the Hub via datasets.push_to_hub.\ndatasets:\n - path: org/dataset-name\n\n\n\n\n\n\nNote\n\n\n\nThere are some other configs which may be required, like name, split, revision, trust_remote_code, etc., depending on the dataset.\n\n\n\n\n\nRemote Filesystems\nVia the storage_options config under load_dataset, you can load datasets from remote filesystems like S3, GCS, Azure, and OCI.\n\n\n\n\n\n\nWarning\n\n\n\nThis is currently experimental. 
Please let us know if you run into any issues!\n\n\nThe only difference between the providers is that you need to prepend the path with the respective protocols.\ndatasets:\n # Single file\n - path: s3://bucket-name/path/to/your/file.jsonl\n\n # Directory\n - path: s3://bucket-name/path/to/your/directory\nFor directories, we load via load_from_disk.\n\nS3\nPrepend the path with s3://.\nThe credentials are pulled in the following order:\n\nAWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_SESSION_TOKEN environment variables\nfrom the ~/.aws/credentials file\nfor nodes on EC2, the IAM metadata provider\n\n\n\n\n\n\n\nNote\n\n\n\nWe assume you have credentials set up and are not using anonymous access. If you want to use anonymous access, let us know! We may have to open a config option for this.\n\n\nOther environment variables that can be set can be found in the boto3 docs\n\n\nGCS\nPrepend the path with gs:// or gcs://.\nThe credentials are loaded in the following order:\n\ngcloud credentials\nfor nodes on GCP, the Google metadata service\nanonymous access\n\n\n\nAzure\n\nGen 1\nPrepend the path with adl://.\nEnsure you have the following environment variables set:\n\nAZURE_STORAGE_TENANT_ID\nAZURE_STORAGE_CLIENT_ID\nAZURE_STORAGE_CLIENT_SECRET\n\n\n\nGen 2\nPrepend the path with abfs:// or az://.\nEnsure you have the following environment variables set:\n\nAZURE_STORAGE_ACCOUNT_NAME\nAZURE_STORAGE_ACCOUNT_KEY\n\nOther environment variables that can be set can be found in the adlfs docs\n\n\n\nOCI\nPrepend the path with oci://.\nIt will attempt to read credentials in the following order:\n\nOCIFS_IAM_TYPE, OCIFS_CONFIG_LOCATION, and OCIFS_CONFIG_PROFILE environment variables\nwhen on an OCI resource, the resource principal\n\nOther environment variables:\n\nOCI_REGION_METADATA\n\nPlease see the ocifs docs.\n\n\n\nHTTPS\nThe path should start with https://.\ndatasets:\n - path: https://path/to/your/dataset/file.jsonl\nThis must be publicly accessible.", + "crumbs": [ + "How To Guides", + "Dataset Loading" + ] + }, + { + "objectID": "docs/dataset_loading.html#next-steps", + "href": "docs/dataset_loading.html#next-steps", + "title": "Dataset Loading", + "section": "Next steps", + "text": "Next steps\nNow that you know how to load datasets, you can learn how to map your specific dataset format to your target output format in the dataset formats docs.", + "crumbs": [ + "How To Guides", + "Dataset Loading" + ] + }, + { + "objectID": "docs/lr_groups.html", + "href": "docs/lr_groups.html", + "title": "Learning Rate Groups", + "section": "", + "text": "Inspired by LoRA+, Axolotl allows practitioners to specify separate learning rates for each module or groups of\nmodules in a model.", + "crumbs": [ + "How To Guides", + "Learning Rate Groups" + ] + }, + { + "objectID": "docs/lr_groups.html#background", + "href": "docs/lr_groups.html#background", + "title": "Learning Rate Groups", + "section": "", + "text": "Inspired by LoRA+, Axolotl allows practitioners to specify separate learning rates for each module or groups of\nmodules in a model.", + "crumbs": [ + "How To Guides", + "Learning Rate Groups" + ] + }, + { + "objectID": "docs/lr_groups.html#example", + "href": "docs/lr_groups.html#example", + "title": "Learning Rate Groups", + "section": "Example", + "text": "Example\nlr_groups:\n - name: o_proj\n modules:\n - self_attn.o_proj.weight\n lr: 1e-6\n - name: q_proj\n modules:\n - model.layers.2.self_attn.q_proj.weight\n lr: 1e-5\n\nlearning_rate: 2e-5\nIn this example, we have a default learning rate of 2e-5 across the entire 
model, but we have a separate learning rate\nof 1e-6 for all the self-attention o_proj modules across all layers, and a learning rate of 1e-5 for the 3rd layer’s\nself-attention q_proj module.", + "crumbs": [ + "How To Guides", + "Learning Rate Groups" + ] + }, + { + "objectID": "docs/mac.html", + "href": "docs/mac.html", + "title": "Mac M-series", + "section": "", + "text": "Currently, Axolotl on Mac is only partially usable: many of Axolotl’s dependencies, including PyTorch, do not support MPS or have incomplete support.\nCurrent support:\n\nSupported: all models, full training of models, LoRA training, sample packing\nNot yet supported: FP16 and BF16 (awaiting AMP support for MPS in PyTorch), Tri-dao’s flash-attn (until it is supported, use sdp_attention as an alternative), xformers, bitsandbytes (meaning no 4/8-bit loading and no bnb optimizers), qlora, DeepSpeed\n\nUntested:\n\nFSDP", + "crumbs": [ + "Deployments", + "Mac M-series" + ] + }, + { + "objectID": "src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html", + "href": "src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html", + "title": "Axolotl", + "section": "", + "text": "Acknowledgements\nPortions of this Cut Cross Entropy Software may utilize the following copyrighted\nmaterial, the use of which is hereby acknowledged.\n\nPyTorch\nFrom PyTorch:\n\nCopyright (c) 2016- Facebook, Inc (Adam Paszke)\nCopyright (c) 2014- Facebook, Inc (Soumith Chintala)\nCopyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)\nCopyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)\nCopyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)\nCopyright (c) 2011-2013 NYU (Clement Farabet)\nCopyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)\nCopyright (c) 2006 Idiap Research Institute (Samy Bengio)\nCopyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)\n\nFrom Caffe2:\n\nCopyright (c) 2016-present, Facebook Inc. All rights reserved.\n\nAll contributions by Facebook:\nCopyright (c) 2016 Facebook Inc.\n\nAll contributions by Google:\nCopyright (c) 2015 Google Inc.\nAll rights reserved.\n\nAll contributions by Yangqing Jia:\nCopyright (c) 2015 Yangqing Jia\nAll rights reserved.\n\nAll contributions by Kakao Brain:\nCopyright 2019-2020 Kakao Brain\n\nAll contributions by Cruise LLC:\nCopyright (c) 2022 Cruise LLC.\nAll rights reserved.\n\nAll contributions by Arm:\nCopyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates\n\nAll contributions from Caffe:\nCopyright(c) 2013, 2014, 2015, the respective contributors\nAll rights reserved.\n\nAll other contributions:\nCopyright(c) 2015, 2016 the respective contributors\nAll rights reserved.\n\nCaffe2 uses a copyright model similar to Caffe: each contributor holds\ncopyright over their contributions to Caffe2. The project versioning records\nall such contribution and copyright details. If a contributor wants to further\nmark their specific copyright on a particular contribution, they should\nindicate their copyright solely in the commit message of the change when it is\ncommitted.\n\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions are met:\n\n1. Redistributions of source code must retain the above copyright\nnotice, this list of conditions and the following disclaimer.\n\n2. 
Redistributions in binary form must reproduce the above copyright\nnotice, this list of conditions and the following disclaimer in the\ndocumentation and/or other materials provided with the distribution.\n\n3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America\nand IDIAP Research Institute nor the names of its contributors may be\nused to endorse or promote products derived from this software without\nspecific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\nARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE\nLIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\nCONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\nSUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\nINTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\nCONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\nARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\nPOSSIBILITY OF SUCH DAMAGE.\nTriton\n/*\n* Copyright 2018-2020 Philippe Tillet\n* Copyright 2020-2022 OpenAI\n*\n* Permission is hereby granted, free of charge, to any person obtaining\n* a copy of this software and associated documentation files\n* (the \"Software\"), to deal in the Software without restriction,\n* including without limitation the rights to use, copy, modify, merge,\n* publish, distribute, sublicense, and/or sell copies of the Software,\n* and to permit persons to whom the Software is furnished to do so,\n* subject to the following conditions:\n*\n* The above copyright notice and this permission notice shall be\n* included in all copies or substantial portions of the Software.\n*\n* THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND,\n* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\n* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.\n* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\n* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,\n* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE\n* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n*/\nTransformers\nCopyright 2018- The Hugging Face team. All rights reserved.\n\n Apache License\n Version 2.0, January 2004\n http://www.apache.org/licenses/\n\nTERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n1. Definitions.\n\n \"License\" shall mean the terms and conditions for use, reproduction,\n and distribution as defined by Sections 1 through 9 of this document.\n\n \"Licensor\" shall mean the copyright owner or entity authorized by\n the copyright owner that is granting the License.\n\n \"Legal Entity\" shall mean the union of the acting entity and all\n other entities that control, are controlled by, or are under common\n control with that entity. 
For the purposes of this definition,\n \"control\" means (i) the power, direct or indirect, to cause the\n direction or management of such entity, whether by contract or\n otherwise, or (ii) ownership of fifty percent (50%) or more of the\n outstanding shares, or (iii) beneficial ownership of such entity.\n\n \"You\" (or \"Your\") shall mean an individual or Legal Entity\n exercising permissions granted by this License.\n\n \"Source\" form shall mean the preferred form for making modifications,\n including but not limited to software source code, documentation\n source, and configuration files.\n\n \"Object\" form shall mean any form resulting from mechanical\n transformation or translation of a Source form, including but\n not limited to compiled object code, generated documentation,\n and conversions to other media types.\n\n \"Work\" shall mean the work of authorship, whether in Source or\n Object form, made available under the License, as indicated by a\n copyright notice that is included in or attached to the work\n (an example is provided in the Appendix below).\n\n \"Derivative Works\" shall mean any work, whether in Source or Object\n form, that is based on (or derived from) the Work and for which the\n editorial revisions, annotations, elaborations, or other modifications\n represent, as a whole, an original work of authorship. For the purposes\n of this License, Derivative Works shall not include works that remain\n separable from, or merely link (or bind by name) to the interfaces of,\n the Work and Derivative Works thereof.\n\n \"Contribution\" shall mean any work of authorship, including\n the original version of the Work and any modifications or additions\n to that Work or Derivative Works thereof, that is intentionally\n submitted to Licensor for inclusion in the Work by the copyright owner\n or by an individual or Legal Entity authorized to submit on behalf of\n the copyright owner. For the purposes of this definition, \"submitted\"\n means any form of electronic, verbal, or written communication sent\n to the Licensor or its representatives, including but not limited to\n communication on electronic mailing lists, source code control systems,\n and issue tracking systems that are managed by, or on behalf of, the\n Licensor for the purpose of discussing and improving the Work, but\n excluding communication that is conspicuously marked or otherwise\n designated in writing by the copyright owner as \"Not a Contribution.\"\n\n \"Contributor\" shall mean Licensor and any individual or Legal Entity\n on behalf of whom a Contribution has been received by Licensor and\n subsequently incorporated within the Work.\n\n2. Grant of Copyright License. Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n copyright license to reproduce, prepare Derivative Works of,\n publicly display, publicly perform, sublicense, and distribute the\n Work and such Derivative Works in Source or Object form.\n\n3. Grant of Patent License. 
Subject to the terms and conditions of\n this License, each Contributor hereby grants to You a perpetual,\n worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n (except as stated in this section) patent license to make, have made,\n use, offer to sell, sell, import, and otherwise transfer the Work,\n where such license applies only to those patent claims licensable\n by such Contributor that are necessarily infringed by their\n Contribution(s) alone or by combination of their Contribution(s)\n with the Work to which such Contribution(s) was submitted. If You\n institute patent litigation against any entity (including a\n cross-claim or counterclaim in a lawsuit) alleging that the Work\n or a Contribution incorporated within the Work constitutes direct\n or contributory patent infringement, then any patent licenses\n granted to You under this License for that Work shall terminate\n as of the date such litigation is filed.\n\n4. Redistribution. You may reproduce and distribute copies of the\n Work or Derivative Works thereof in any medium, with or without\n modifications, and in Source or Object form, provided that You\n meet the following conditions:\n\n (a) You must give any other recipients of the Work or\n Derivative Works a copy of this License; and\n\n (b) You must cause any modified files to carry prominent notices\n stating that You changed the files; and\n\n (c) You must retain, in the Source form of any Derivative Works\n that You distribute, all copyright, patent, trademark, and\n attribution notices from the Source form of the Work,\n excluding those notices that do not pertain to any part of\n the Derivative Works; and\n\n (d) If the Work includes a \"NOTICE\" text file as part of its\n distribution, then any Derivative Works that You distribute must\n include a readable copy of the attribution notices contained\n within such NOTICE file, excluding those notices that do not\n pertain to any part of the Derivative Works, in at least one\n of the following places: within a NOTICE text file distributed\n as part of the Derivative Works; within the Source form or\n documentation, if provided along with the Derivative Works; or,\n within a display generated by the Derivative Works, if and\n wherever such third-party notices normally appear. The contents\n of the NOTICE file are for informational purposes only and\n do not modify the License. You may add Your own attribution\n notices within Derivative Works that You distribute, alongside\n or as an addendum to the NOTICE text from the Work, provided\n that such additional attribution notices cannot be construed\n as modifying the License.\n\n You may add Your own copyright statement to Your modifications and\n may provide additional or different license terms and conditions\n for use, reproduction, or distribution of Your modifications, or\n for any such Derivative Works as a whole, provided Your use,\n reproduction, and distribution of the Work otherwise complies with\n the conditions stated in this License.\n\n5. Submission of Contributions. Unless You explicitly state otherwise,\n any Contribution intentionally submitted for inclusion in the Work\n by You to the Licensor shall be under the terms and conditions of\n this License, without any additional terms or conditions.\n Notwithstanding the above, nothing herein shall supersede or modify\n the terms of any separate license agreement you may have executed\n with Licensor regarding such Contributions.\n\n6. Trademarks. 
This License does not grant permission to use the trade\n names, trademarks, service marks, or product names of the Licensor,\n except as required for reasonable and customary use in describing the\n origin of the Work and reproducing the content of the NOTICE file.\n\n7. Disclaimer of Warranty. Unless required by applicable law or\n agreed to in writing, Licensor provides the Work (and each\n Contributor provides its Contributions) on an \"AS IS\" BASIS,\n WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n implied, including, without limitation, any warranties or conditions\n of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n PARTICULAR PURPOSE. You are solely responsible for determining the\n appropriateness of using or redistributing the Work and assume any\n risks associated with Your exercise of permissions under this License.\n\n8. Limitation of Liability. In no event and under no legal theory,\n whether in tort (including negligence), contract, or otherwise,\n unless required by applicable law (such as deliberate and grossly\n negligent acts) or agreed to in writing, shall any Contributor be\n liable to You for damages, including any direct, indirect, special,\n incidental, or consequential damages of any character arising as a\n result of this License or out of the use or inability to use the\n Work (including but not limited to damages for loss of goodwill,\n work stoppage, computer failure or malfunction, or any and all\n other commercial damages or losses), even if such Contributor\n has been advised of the possibility of such damages.\n\n9. Accepting Warranty or Additional Liability. While redistributing\n the Work or Derivative Works thereof, You may choose to offer,\n and charge a fee for, acceptance of support, warranty, indemnity,\n or other liability obligations and/or rights consistent with this\n License. However, in accepting such obligations, You may act only\n on Your own behalf and on Your sole responsibility, not on behalf\n of any other Contributor, and only if You agree to indemnify,\n defend, and hold each Contributor harmless for any liability\n incurred by, or claims asserted against, such Contributor by reason\n of your accepting any such warranty or additional liability.\n\nEND OF TERMS AND CONDITIONS\n\nAPPENDIX: How to apply the Apache License to your work.\n\n To apply the Apache License to your work, attach the following\n boilerplate notice, with the fields enclosed by brackets \"[]\"\n replaced with your own identifying information. (Don't include\n the brackets!) The text should be enclosed in the appropriate\n comment syntax for the file format. We also recommend that a\n file or class name and description of purpose be included on the\n same \"printed page\" as the copyright notice for easier\n identification within third-party archives.\n\nCopyright [yyyy] [name of copyright owner]\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\n http://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License." 
+ }, + { + "objectID": "src/axolotl/integrations/LICENSE.html", + "href": "src/axolotl/integrations/LICENSE.html", + "title": "Axolotl", + "section": "", + "text": "AXOLOTL COMMUNITY LICENSE AGREEMENT\nThis Axolotl Community License Agreement (“Agreement”) is entered into by and between Axolotl AI Corp. (“Axolotl”) and\nany individual or entity (“Licensee”) who wishes to use the Software (as defined below) in accordance with the terms\nand conditions set forth in this Agreement.\n\nDefinitions\n1.1 “Licensee” refers to any individual or entity who has obtained a copy of the Software under this Agreement.\n1.2 “Plugin Integration” means independent integration software modules which may or may not be offered by Axolotl,\nwhich may be licensed separately by their respective authors and/or licensors.\n1.3 “Software” refers to the specific sub-directory of the Axolotl, Inc. software located at\nhttps://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations and its subdirectories which\npermits Plugin Integrations to integrate with the Axolotl service.\nGrant of License\n2.1 Axolotl hereby grants Licensee a worldwide, non-exclusive, royalty-free, license to use, copy, modify, merge,\npublish, distribute, sublicense, and/or otherwise exploit the Software, subject to the following conditions:\n- Licensee must comply with all the terms and conditions of this Agreement.\n- Licensee must include the original copyright notice and disclaimer of warranty in all copies or substantial\nportions of the Software.\n2.2 Licensee may use the Software for any lawful purpose, except as restricted in Section 3.\nRestrictions\n3.1 Licensee shall not use the Software for any activity that constitutes a commercial activity of offering for\nfree or for sale any services, platform, or equivalent to third parties for the purposes of allowing such\nthird parties to fine-tune artificial intelligence models.\n3.2 Licensee shall not:\n- Use the Software for any illegal or unauthorized purpose.\n- Reverse engineer, decompile, or disassemble the Software.\n- Remove or modify any copyright, trademark, or other proprietary notices contained in the Software.\n- Use the Software in a way that could damage, disable, overburden, or impair the functionality of the\nSoftware or interfere with any third-party use of the Software.\n3.3 Axolotl reserves the right to restrict certain Plugin Integrations for use with the Software. To the extent Licensee integrates a permitted, applicable Plugin Integration with the Software, Licensee shall comply with any additional terms and conditions imposed by the licensors of such Plugin Integration for use of such Plugin Integrations. Licensee shall contact Axolotl if it has questions about whether its use of the Software falls beyond the scope of this Agreement.\nIntellectual Property Rights\n4.1 Axolotl and its contributors retain all intellectual property rights in and to the Software. Licensee\nacknowledges that this Agreement does not transfer any ownership rights or intellectual property rights to\nLicensee.\nDisclaimer of Warranty\n5.1 THE SOFTWARE IS PROVIDED “AS IS,” WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED\nTO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. 
IN NO EVENT SHALL\nTHE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF\nCONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\nDEALINGS IN THE SOFTWARE.\nTermination\n6.1 Axolotl may terminate this Agreement at any time if Licensee fails to comply with any of the terms and\nconditions set forth herein. Upon termination, Licensee shall cease all use of the Software and destroy any\ncopies in its possession.\nGoverning Law\n7.1 This Agreement shall be governed by and construed in accordance with the laws of the State of California,\nwithout regards to conflicts of laws provisions thereof.\nEntire Agreement\n8.1 This Agreement constitutes the entire agreement between Axolotl and Licensee with respect to the subject matter\nhereof and supersedes all prior or contemporaneous understandings or agreements between the parties concerning\nthe Software, whether written or oral. Axolotl may update the terms of this Agreement from time to time, and\nLicensee’s continued use of the Software after any such updates shall constitute acceptance of updated terms\non a go-forward basis. Axolotl will use commercially reasonable efforts to provide Licensee notice of any\nmaterial updates. By using the Software, Licensee acknowledges that it has read, understood, and agrees to be\nbound by the terms and conditions of this Agreement.\n\nThis Agreement was last updated on August 23, 2024." + }, + { + "objectID": "docs/input_output.html", + "href": "docs/input_output.html", + "title": "Template-free prompt construction", + "section": "", + "text": "The documentation has moved here." + }, + { + "objectID": "docs/gradient_checkpointing.html", + "href": "docs/gradient_checkpointing.html", + "title": "Gradient Checkpointing and Activation Offloading", + "section": "", + "text": "Gradient checkpointing and activation offloading are techniques used to optimize the performance of deep learning\nmodels by reducing the memory footprint and improving computational efficiency.\n\nEnabling Gradient Checkpointing\ngradient_checkpointing: true\n\n\nEnabling Activation Offloading\ngradient_checkpointing: true # required for activation offloading\nactivation_offloading: true\nActivation offloading variants:\nThe default activation_offloading: true offloads activations to CPU and uses CUDA streams\nto overlap communication and computation when offloading.\nThe activation_offloading: legacy option naively offloads activations to CPU without additional optimizations.\nFor resource-constrained environments with limited CPU memory, activation_offloading: disk offloads\nactivations to disk instead of CPU RAM so that much larger context lengths can be trained with minimal memory.", + "crumbs": [ + "Advanced Features", + "Gradient Checkpointing and Activation Offloading" + ] + }, + { + "objectID": "docs/qat.html", + "href": "docs/qat.html", + "title": "Quantization Aware Training (QAT)", + "section": "", + "text": "Quantization Aware Training (QAT) is a technique for improving the accuracy of models which are quantized\nby applying “fake” quantization to the model’s weights (and optionally, activations) during training. This fake\nquantization allows the model to adjust to the noise introduced by quantization, so that when the model is eventually\nquantized, the accuracy loss is minimized. 
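To make the “fake” quantization idea concrete, here is a minimal sketch of one common symmetric per-group scheme (our notation, not necessarily torchao’s exact recipe): for a group of weights $w_1, \dots, w_g$ quantized to $b$ bits,

$$ s = \frac{\max_i \lvert w_i \rvert}{2^{b-1} - 1}, \qquad \hat{w}_i = s \cdot \operatorname{clamp}\!\left(\operatorname{round}\!\left(\frac{w_i}{s}\right),\; -2^{b-1},\; 2^{b-1} - 1\right). $$

The forward pass uses the rounded $\hat{w}_i$ while the optimizer keeps updating the full-precision $w_i$ (gradients pass through the rounding via a straight-through estimator), so the model learns weights that survive the eventual real quantization.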
We use the quantization techniques implemented in torchao to provide\nsupport for QAT and post-training quantization (PTQ) in axolotl.\nWe recommend reviewing the excellent QAT tutorial in the torchtune library,\nand the QAT documentation in the torchao library, for more details.", + "crumbs": [ + "How To Guides", + "Quantization Aware Training (QAT)" + ] + }, + { + "objectID": "docs/qat.html#overview", + "href": "docs/qat.html#overview", + "title": "Quantization Aware Training (QAT)", + "section": "", + "text": "Quantization Aware Training (QAT) is a technique for improving the accuracy of models which are quantized\nby applying “fake” quantization to the model’s weights (and optionally, activations) during training. This fake\nquantization allows the model to adjust to the noise introduced by quantization, so that when the model is eventually\nquantized, the accuracy loss is minimized. We use the quantization techniques implemented in torchao to provide\nsupport for QAT and post-training quantization (PTQ) in axolotl.\nWe recommend reviewing the excellent QAT tutorial in the torchtune library,\nand the QAT documentation in the torchao library, for more details.", + "crumbs": [ + "How To Guides", + "Quantization Aware Training (QAT)" + ] + }, + { + "objectID": "docs/qat.html#configuring-qat-in-axolotl", + "href": "docs/qat.html#configuring-qat-in-axolotl", + "title": "Quantization Aware Training (QAT)", + "section": "Configuring QAT in Axolotl", + "text": "Configuring QAT in Axolotl\nTo enable QAT in axolotl, add the following to your configuration file:\nqat:\n activation_dtype: # Optional[str] = \"int8\". Fake quantization layout to use for activation quantization. Valid options are \"int4\" and \"int8\"\n weight_dtype: # Optional[str] = \"int8\". Fake quantization layout to use for weight quantization. Valid options are \"int4\" and \"int8\"\n group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization\n fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after\nOnce you have finished training, you must quantize your model using the same quantization configuration you used during training. You can use the quantize command to do this.", + "crumbs": [ + "How To Guides", + "Quantization Aware Training (QAT)" + ] + }, + { + "objectID": "docs/faq.html", + "href": "docs/faq.html", + "title": "FAQ", + "section": "", + "text": "General\nQ: The trainer stopped and hasn’t progressed for several minutes.\n\nA: Usually an issue with the GPUs communicating with each other. See the NCCL doc\n\nQ: exitcode: -9\n\nA: This usually happens when you run out of system RAM.\n\nQ: exitcode: -7 while using deepspeed\n\nA: Try upgrading deepspeed with: pip install -U deepspeed\n\nQ: AttributeError: ‘DummyOptim’ object has no attribute ‘step’\nQ: ModuleNotFoundError: No module named ‘mpi4py’ using single GPU with deepspeed\n\nA: You may be using deepspeed with a single GPU. Please remove the deepspeed: section in the yaml file or the --deepspeed CLI flag.\n\nQ: The code is stuck on saving preprocessed datasets.\n\nA: This is usually an issue with the GPU. This can be resolved by setting the OS environment variable CUDA_VISIBLE_DEVICES=0. If you are on runpod, this is usually a pod issue. Starting a new pod should take care of it.\n\nQ: Received a torch.Size mismatch error between the checkpoint and the model when merging or loading adapters.\n\nA: This is likely due to a vocab size mismatch. 
By default, Axolotl expands the model’s embeddings if the tokenizer has more tokens than the model. Please use the axolotl merge-lora command to merge the adapters instead of using your own scripts.\n\n\nOn the other hand, if the model has more tokens than the tokenizer, Axolotl does not shrink the model’s embeddings unless shrink_embeddings: true is set in the config.\n\nQ: How to call Axolotl via custom Python scripts?\n\nA: Since Axolotl is just Python, please see src/axolotl/cli/main.py for how each command is called.\n\nQ: How to know the value to use for fsdp_transformer_layer_cls_to_wrap?\n\nA: This is the class name of the transformer layer to wrap with FSDP. For example, for LlamaForCausalLM, the value is LlamaDecoderLayer. To find this for a specific model, check the model’s PreTrainedModel definition and look for the _no_split_modules variable in the modeling_<model_name>.py file within the transformers library.\n\nQ: ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as pad_token\n\nA: This is because the tokenizer does not have a padding token. Please add a padding token to the tokenizer via:\n\n\nspecial_tokens:\n # str. If you're not sure, set to same as `eos_token`.\n pad_token: \"...\"\n\nQ: IterableDataset error or KeyError: 'input_ids' when using preprocess CLI\n\nA: This is because you may be using the preprocess CLI with pretraining_dataset: or skip_prepare_dataset: true respectively. Please use the axolotl train CLI directly instead, as these datasets are prepared on demand.\n\nQ: vLLM is not working with Axolotl\n\nA: We currently recommend torch 2.6.0 for use with vllm. Please ensure you use the right version. For Docker, please use the main-py3.11-cu124-2.6.0 tag.\n\nQ: FA2 2.8.0 undefined symbol runtime error on CUDA 12.4\n\nA: There seems to be a wheel issue with FA2 2.8.0 on CUDA 12.4. Try CUDA 12.6 instead or downgrade to FA2 2.7.4. Please refer to the upstream issue: https://github.com/Dao-AILab/flash-attention/issues/1717.\n\n\n\nChat templates\nQ: jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____\n\nA: This means that the property mapping for the stated attribute does not exist when building the chat_template prompt. For example, if there is no attribute 'content', please check that you have added the correct mapping for content under message_property_mappings.\n\nQ: Empty template generated for turn ___\n\nA: The content is empty for that turn.\n\nQ: Could not find content start/end boundary for turn __\n\nA: The specific turn’s start/end could not be detected. Please ensure you have set the eos_token following your chat_template. Otherwise, this could be a chat_template which doesn’t use proper boundaries for each turn (like system). On the rare occasion, make sure your content is not [[dummy_message]]. Please let us know about this.\n\nQ: Content end boundary is before start boundary for turn ___\n\nA: This is an edge case which should not occur. Please create an Issue if this happens.\n\nQ: Content end boundary is the same as start boundary for turn ___. This is likely an empty turn.\n\nA: This is likely an empty turn.\n\nQ: The EOS token is incorrectly being masked or not being masked / EOS token __ not found in chat template.\n\nA: There can be two reasons:\n\n\n\nThis is because of the mismatch between tokenizer.eos_token and the EOS token in the template. Please make sure to set eos_token: under special_tokens: to the same EOS token as in the template.\n\n\n\n\nThe EOS token is not in the template. 
Please check if your template is correct. As an example, the phi_35 template does not use its dedicated EOS token <|endoftext|> at the end.\n\n\nQ: “chat_template choice is tokenizer_default but tokenizer’s chat_template is null. Please add a chat_template in tokenizer config”\n\nA: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See chat_template for more details.\n\nQ: The EOT token(s) are incorrectly being masked or not being masked / EOT token __ not found in chat template.\n\nA: There can be two reasons:\n\n\n\nThe EOT token is different from the EOS token and was not specified under eot_tokens:. Please set eot_tokens: to the same EOT token(s) as in the template.\n\n\n\n\nThere is more than one EOT token per turn in the template. Please raise an issue with examples as we recognize this as an edge case.\n\n\nQ: EOT token encoding failed. Please check if the token is valid and can be encoded.\n\nA: There could be some issue with the tokenizer or unicode encoding. Please raise an issue with examples of the EOT token & tokenizer causing the issue.\n\nQ: EOT token __ is encoded as multiple tokens.\n\nA: This is because the EOT token is encoded as multiple tokens, which can cause unexpected behavior. Please add it under tokens: or (recommended) override unused added_tokens via added_tokens_overrides:.\n\nQ: Conflict between train_on_eos and train_on_eot. eos_token is in eot_tokens and train_on_eos != train_on_eot\n\nA: This is because the EOS token is in eot_tokens: while there is a mismatch between train_on_eos: and train_on_eot:. This will cause one to override the other. Please ensure that train_on_eos: and train_on_eot: are the same, or remove the EOS token from eot_tokens:.\n\nQ: If eot_tokens: is not provided, what happens?\n\nA: If eot_tokens: is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable.\n\n\nInternally, eot_tokens: defaults to tokenizer.eos_token and train_on_eot: defaults to train_on_eos (which defaults to turn). This transition helps clarify the naming and behavior of EOT/EOS tokens.\n\nQ: Data processing error: CAS service error\n\nA: Try disabling XET with export HF_HUB_DISABLE_XET=1\n\nQ: torch._inductor.exc.LoweringException: NoValidChoicesError: No choices to select, please consider adding ATEN into max_autotune_gemm_backends config (defined in torch/_inductor/config.py) to allow at least one choice.\n\nA: Depending on the version of torch, you may need to include this in your YAML:\n\n\nflex_attn_compile_kwargs:\n dynamic: false\n mode: max-autotune-no-cudagraphs\n\nQ: ValueError(\"Backward pass should have cleared tracker of all tensors\")\n\nA: This may happen due to edge cases in using the modern OffloadActivations context manager for CUDA streams. 
If you encounter this error, you may have success using the naive implementation with activation_offloading: legacy in your YAML.", + "crumbs": [ + "Troubleshooting", + "FAQ" + ] + }, + { + "objectID": "docs/dataset_preprocessing.html", + "href": "docs/dataset_preprocessing.html", + "title": "Dataset Preprocessing", + "section": "", + "text": "Dataset pre-processing is the step where Axolotl takes each dataset you’ve configured alongside\nthe dataset format and prompt strategies to:\n\nparse the dataset based on the dataset format\ntransform the dataset to how you would interact with the model based on the prompt strategy\ntokenize the dataset based on the configured model & tokenizer\nshuffle and merge multiple datasets together if using more than one\n\nThe processing of the datasets can happen in one of two ways:\n\nBefore kicking off training by calling axolotl preprocess config.yaml --debug\nWhen training is started\n\n\n\nWhen training interactively or for sweeps\n(e.g. you are restarting the trainer often), processing the datasets can oftentimes be frustratingly\nslow. Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent\ntraining parameters so that it will intelligently pull from its cache when possible.\nThe path of the cache is controlled by dataset_prepared_path: and is often left blank in example\nYAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data.\nIf dataset_prepared_path: is left empty, when training, the processed dataset will be cached in a\ndefault path of ./last_run_prepared/, but will ignore anything already cached there. By explicitly\nsetting dataset_prepared_path: ./last_run_prepared, the trainer will use whatever pre-processed\ndata is in the cache.\n\n\n\nLet’s say you are writing a custom prompt strategy or using a user-defined\nprompt template. Because the trainer cannot readily detect these changes, we cannot change the\ncalculated hash value for the pre-processed dataset.\nIf you have dataset_prepared_path: ... set\nand change your prompt templating logic, it may not pick up the changes you made and you will be\ntraining over the old prompt.", + "crumbs": [ + "Core Concepts", + "Dataset Preprocessing" + ] + }, + { + "objectID": "docs/dataset_preprocessing.html#overview", + "href": "docs/dataset_preprocessing.html#overview", + "title": "Dataset Preprocessing", + "section": "", + "text": "Dataset pre-processing is the step where Axolotl takes each dataset you’ve configured alongside\nthe dataset format and prompt strategies to:\n\nparse the dataset based on the dataset format\ntransform the dataset to how you would interact with the model based on the prompt strategy\ntokenize the dataset based on the configured model & tokenizer\nshuffle and merge multiple datasets together if using more than one\n\nThe processing of the datasets can happen in one of two ways:\n\nBefore kicking off training by calling axolotl preprocess config.yaml --debug\nWhen training is started\n\n\n\nWhen training interactively or for sweeps\n(e.g. you are restarting the trainer often), processing the datasets can oftentimes be frustratingly\nslow. 
Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent\ntraining parameters so that it will intelligently pull from its cache when possible.\nThe path of the cache is controlled by dataset_prepared_path: and is often left blank in example\nYAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data.\nIf dataset_prepared_path: is left empty, when training, the processed dataset will be cached in a\ndefault path of ./last_run_prepared/, but will ignore anything already cached there. By explicitly\nsetting dataset_prepared_path: ./last_run_prepared, the trainer will use whatever pre-processed\ndata is in the cache.\n\n\n\nLet’s say you are writing a custom prompt strategy or using a user-defined\nprompt template. Because the trainer cannot readily detect these changes, we cannot change the\ncalculated hash value for the pre-processed dataset.\nIf you have dataset_prepared_path: ... set\nand change your prompt templating logic, it may not pick up the changes you made and you will be\ntraining over the old prompt.", + "crumbs": [ + "Core Concepts", + "Dataset Preprocessing" + ] + }, + { + "objectID": "docs/nccl.html", + "href": "docs/nccl.html", + "title": "NCCL", + "section": "", + "text": "NVIDIA NCCL is a library to facilitate and optimize multi-GPU communication operations, such as broadcast, all-gather, reduce, all-reduce, etc. Broadly, NCCL configuration is highly environment-specific and is configured via several environment variables. A common NCCL-related problem occurs when a long-running operation times out, causing the training process to abort:\nWatchdog caught collective operation timeout: WorkNCCL(SeqNum=42, OpType=ALLGATHER, Timeout(ms)=1800000) ran for 1806948 milliseconds before timing out.\nOften, this timeout will happen after 30 minutes (the default setting) and is accompanied by below-average power consumption with near 100% GPU utilization before the error is raised. NVIDIA recommends disabling PCI access control services (ACS) as a possible solution if this is available to you.\nForcing cross-GPU communication via NVLink may help without increasing timeouts. To verify that your configuration is leveraging NVLink, run the following command:\nnvidia-smi nvlink --status\nTo force NCCL to use NVLink, simply set this in the environment:\nexport NCCL_P2P_LEVEL=NVL\nIf NVLink is not available in your environment, there are other options for NCCL_P2P_LEVEL in the table below:\n\n\n\n\n\n\n\nNCCL_P2P_LEVEL\nDescription\n\n\n\n\nPIX\nP2P data transfers through no more than a single PCIe bridge. 
Faster data transfer rates than paths involving multiple bridges, but slower than direct GPU-to-GPU communication.\n\n\nPXB\nP2P data transfers through multiple PCIe bridges but not going through the PCIe Host Bridge; this path involves a complex routing process, potentially incurring a moderate level of latency.\n\n\nPHB\nP2P data transfers occur over the PCIe and through a PCIe Host Bridge, typically involving the CPU, which can facilitate direct memory access but might introduce additional latency compared to more direct paths (e.g., PIX, NVL)\n\n\n\nTo validate that acceptable data transfer speeds exist for your training job, running NCCL Tests can help pinpoint bottlenecks, for example:\n./build/all_reduce_perf -b 8 -e 128M -f 2 -g 3\nIt can be useful when debugging NCCL communication timeouts to activate additional logging in both PyTorch and NCCL:\nexport NCCL_DEBUG=INFO\nexport NCCL_DEBUG_SUBSYS=ALL\nexport TORCH_DISTRIBUTED_DEBUG=INFO\nexport TORCHELASTIC_ERROR_FILE=/PATH/TO/torcherror.log\nFinally, if you believe your training job needs more time, you can increase the timeout past 30 minutes by setting the ddp_timeout value (in seconds, e.g. ddp_timeout: 3600 for one hour) in the Axolotl configuration. See PyTorch init_process_group for documentation on this value.", + "crumbs": [ + "Troubleshooting", + "NCCL" + ] + }, + { + "objectID": "docs/cli.html", + "href": "docs/cli.html", + "title": "Command Line Interface (CLI)", + "section": "", + "text": "The Axolotl CLI provides a streamlined interface for training and fine-tuning large language models. This guide covers\nthe CLI commands, their usage, and common examples.", + "crumbs": [ + "Getting Started", + "Command Line Interface (CLI)" + ] + }, + { + "objectID": "docs/cli.html#basic-commands", + "href": "docs/cli.html#basic-commands", + "title": "Command Line Interface (CLI)", + "section": "Basic Commands", + "text": "Basic Commands\nAll Axolotl commands follow this general structure:\naxolotl <command> [config.yml] [options]\nThe config file can be a local path or a URL to a raw YAML file.\n\nLauncher Arguments\nFor commands that support multi-GPU (train, evaluate, …), you can pass launcher-specific arguments using the -- separator:\n# Pass torchrun arguments\naxolotl train config.yml --launcher torchrun -- --nproc_per_node=2 --nnodes=1\n\n# Pass accelerate arguments\naxolotl train config.yml --launcher accelerate -- --config_file=accelerate_config.yml --num_processes=4\nArguments after -- are passed directly to the launcher (torchrun, accelerate launch, etc.).", + "crumbs": [ + "Getting Started", + "Command Line Interface (CLI)" + ] + }, + { + "objectID": "docs/cli.html#command-reference", + "href": "docs/cli.html#command-reference", + "title": "Command Line Interface (CLI)", + "section": "Command Reference", + "text": "Command Reference\n\nfetch\nDownloads example configurations and deepspeed configs to your local machine.\n# Get example YAML files\naxolotl fetch examples\n\n# Get deepspeed config files\naxolotl fetch deepspeed_configs\n\n# Specify custom destination\naxolotl fetch examples --dest path/to/folder\n\n\npreprocess\nPreprocesses and tokenizes your dataset before training. 
", + "crumbs": [ + "Troubleshooting", + "NCCL" + ] + }, + { + "objectID": "docs/cli.html", + "href": "docs/cli.html", + "title": "Command Line Interface (CLI)", + "section": "", + "text": "The Axolotl CLI provides a streamlined interface for training and fine-tuning large language models. This guide covers\nthe CLI commands, their usage, and common examples.", + "crumbs": [ + "Getting Started", + "Command Line Interface (CLI)" + ] + }, + { + "objectID": "docs/cli.html#basic-commands", + "href": "docs/cli.html#basic-commands", + "title": "Command Line Interface (CLI)", + "section": "Basic Commands", + "text": "Basic Commands\nAll Axolotl commands follow this general structure:\naxolotl <command> [config.yml] [options]\nThe config file can be local or a URL to a raw YAML file.\n\nLauncher Arguments\nFor commands that support multi-GPU (train, evaluate, …), you can pass launcher-specific arguments using the -- separator:\n# Pass torchrun arguments\naxolotl train config.yml --launcher torchrun -- --nproc_per_node=2 --nnodes=1\n\n# Pass accelerate arguments\naxolotl train config.yml --launcher accelerate -- --config_file=accelerate_config.yml --num_processes=4\nArguments after -- are passed directly to the launcher (torchrun, accelerate launch, etc.).", + "crumbs": [ + "Getting Started", + "Command Line Interface (CLI)" + ] + }, + { + "objectID": "docs/cli.html#command-reference", + "href": "docs/cli.html#command-reference", + "title": "Command Line Interface (CLI)", + "section": "Command Reference", + "text": "Command Reference\n\nfetch\nDownloads example configurations and deepspeed configs to your local machine.\n# Get example YAML files\naxolotl fetch examples\n\n# Get deepspeed config files\naxolotl fetch deepspeed_configs\n\n# Specify custom destination\naxolotl fetch examples --dest path/to/folder\n\n\npreprocess\nPreprocesses and tokenizes your dataset before training. This is recommended for large datasets.\n# Basic preprocessing\naxolotl preprocess config.yml\n\n# Preprocessing with one GPU\nCUDA_VISIBLE_DEVICES=\"0\" axolotl preprocess config.yml\n\n# Debug mode to see processed examples\naxolotl preprocess config.yml --debug\n\n# Debug with limited examples\naxolotl preprocess config.yml --debug --debug-num-examples 5\nConfiguration options:\ndataset_prepared_path: Local folder for saving preprocessed data\npush_dataset_to_hub: HuggingFace repo to push preprocessed data to (optional)\n\n\ntrain\nTrains or fine-tunes a model using the configuration specified in your YAML file.\n# Basic training\naxolotl train config.yml\n\n# Train and set/override specific options\naxolotl train config.yml \\\n --learning-rate 1e-4 \\\n --micro-batch-size 2 \\\n --num-epochs 3\n\n# Training without accelerate\naxolotl train config.yml --launcher python\n\n# Pass launcher-specific arguments using the -- separator\naxolotl train config.yml --launcher torchrun -- --nproc_per_node=2 --nnodes=1\naxolotl train config.yml --launcher accelerate -- --config_file=accelerate_config.yml\n\n# Resume training from checkpoint\naxolotl train config.yml --resume-from-checkpoint path/to/checkpoint\nIt is possible to run sweeps over multiple hyperparameters by passing in a sweep config.\n# Basic training with sweeps\naxolotl train config.yml --sweep path/to/sweep.yaml\nExample sweep config:\n_:\n # This section is for dependent variables we need to fix\n - load_in_8bit: false\n load_in_4bit: false\n adapter: lora\n - load_in_8bit: true\n load_in_4bit: false\n adapter: lora\n\n# These are independent variables\nlearning_rate: [0.0003, 0.0006]\nlora_r:\n - 16\n - 32\nlora_alpha:\n - 16\n - 32\n - 64\n\n\ninference\nRuns inference using your trained model in either CLI or Gradio interface mode.\n# CLI inference with LoRA\naxolotl inference config.yml --lora-model-dir=\"./outputs/lora-out\"\n\n# CLI inference with full model\naxolotl inference config.yml --base-model=\"./completed-model\"\n\n# Gradio web interface\naxolotl inference config.yml --gradio \\\n --lora-model-dir=\"./outputs/lora-out\"\n\n# Inference with input from file\ncat prompt.txt | axolotl inference config.yml \\\n --base-model=\"./completed-model\"\n\n\nmerge-lora\nMerges trained LoRA adapters into the base model.\n# Basic merge\naxolotl merge-lora config.yml\n\n# Specify LoRA directory (usually used with checkpoints)\naxolotl merge-lora config.yml --lora-model-dir=\"./lora-output/checkpoint-100\"\n\n# Merge using CPU (if out of GPU memory)\nCUDA_VISIBLE_DEVICES=\"\" axolotl merge-lora config.yml\nConfiguration options:\ngpu_memory_limit: Limit GPU memory usage\nlora_on_cpu: Load LoRA weights on CPU
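\nFor example, in your YAML (a sketch; the values are illustrative):\ngpu_memory_limit: 20GiB\nlora_on_cpu: true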
\n\n\nmerge-sharded-fsdp-weights\nMerges sharded FSDP model checkpoints into a single combined checkpoint.\n# Basic merge\naxolotl merge-sharded-fsdp-weights config.yml\n\n\nevaluate\nEvaluates a model’s performance (loss, etc.) on the train and eval datasets.\n# Basic evaluation\naxolotl evaluate config.yml\n\n# Evaluation with launcher arguments\naxolotl evaluate config.yml --launcher torchrun -- --nproc_per_node=2\n\n\nlm-eval\nRuns the LM Evaluation Harness on your model.\n# Basic evaluation\naxolotl lm-eval config.yml\nConfiguration options:\n# List of tasks to evaluate\nlm_eval_tasks:\n - arc_challenge\n - hellaswag\nlm_eval_batch_size: # Batch size for evaluation\noutput_dir: # Directory to save evaluation results\nSee LM Eval Harness for more details.\n\n\ndelinearize-llama4\nDelinearizes a linearized Llama 4 model into a regular HuggingFace Llama 4 model. This only works with the non-quantized linearized model.\naxolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir\nThis is necessary for using the model with other frameworks. If you have an adapter, merge it with the non-quantized linearized model before delinearizing.\n\n\nquantize\nQuantizes a model using the quantization configuration specified in your YAML file.\naxolotl quantize config.yml\nSee Quantization for more details.", + "crumbs": [ + "Getting Started", + "Command Line Interface (CLI)" + ] + }, + { + "objectID": "docs/cli.html#legacy-cli-usage", + "href": "docs/cli.html#legacy-cli-usage", + "title": "Command Line Interface (CLI)", + "section": "Legacy CLI Usage", + "text": "Legacy CLI Usage\nWhile the new Click-based CLI is preferred, Axolotl still supports the legacy module-based CLI:\n# Preprocess\npython -m axolotl.cli.preprocess config.yml\n\n# Train\naccelerate launch -m axolotl.cli.train config.yml\n\n# Inference\naccelerate launch -m axolotl.cli.inference config.yml \\\n --lora_model_dir=\"./outputs/lora-out\"\n\n# Gradio interface\naccelerate launch -m axolotl.cli.inference config.yml \\\n --lora_model_dir=\"./outputs/lora-out\" --gradio\n\n\n\n\n\n\nImportant\n\n\n\nWhen overriding CLI parameters in the legacy CLI, use the same notation as in the YAML file (e.g., --lora_model_dir).\nNote: This differs from the new Click-based CLI, which uses dash notation (e.g., --lora-model-dir). Keep this in mind if you’re referencing newer documentation or switching between CLI versions.", + "crumbs": [ + "Getting Started", + "Command Line Interface (CLI)" + ] + }, + { + "objectID": "docs/cli.html#remote-compute-with-modal-cloud", + "href": "docs/cli.html#remote-compute-with-modal-cloud", + "title": "Command Line Interface (CLI)", + "section": "Remote Compute with Modal Cloud", + "text": "Remote Compute with Modal Cloud\nAxolotl supports running training and inference workloads on Modal cloud infrastructure. This is configured using a\ncloud YAML file alongside your regular Axolotl config.\n\nCloud Configuration\nCreate a cloud config YAML with your Modal settings:\n# cloud_config.yml\nprovider: modal\ngpu: a100 # Supported: l40s, a100-40gb, a100-80gb, a10g, h100, t4, l4\ngpu_count: 1 # Number of GPUs to use\ntimeout: 86400 # Maximum runtime in seconds (24 hours)\nbranch: main # Git branch to use (optional)\n\nvolumes: # Persistent storage volumes\n - name: axolotl-cache\n mount: /workspace/cache\n - name: axolotl-data\n mount: /workspace/data\n - name: axolotl-artifacts\n mount: /workspace/artifacts\n\nsecrets: # Secrets to inject\n - WANDB_API_KEY\n - HF_TOKEN\n\n\nRunning on Modal Cloud\nCommands that support the --cloud flag:\n# Preprocess on cloud\naxolotl preprocess config.yml --cloud cloud_config.yml\n\n# Train on cloud\naxolotl train config.yml --cloud cloud_config.yml\n\n# Run lm-eval on cloud\naxolotl lm-eval config.yml --cloud cloud_config.yml\n\n\nCloud Configuration Options\nprovider: # compute provider, currently only `modal` is supported\ngpu: # GPU type to use\ngpu_count: # Number of GPUs (default: 1)\nmemory: # RAM in GB (default: 128)\ntimeout: # Maximum runtime in seconds\ntimeout_preprocess: # Preprocessing timeout\nbranch: # Git branch to use\ndocker_tag: # Custom Docker image tag\nvolumes: # List of persistent storage volumes\n\n# Environment variables to pass. Can be specified in two ways:\n# 1. As a string: Will load the value from the host computer's environment variables\n# 2. 
As a key-value pair: Will use the specified value directly\n# Example:\n# env:\n# - CUSTOM_VAR # Loads from host's $CUSTOM_VAR\n# - {CUSTOM_VAR: \"value\"} # Uses \"value\" directly\nenv:\n\n# Secrets to inject. Same input format as `env`, but for sensitive data.\nsecrets:\n # - HF_TOKEN\n # - WANDB_API_KEY", + "crumbs": [ + "Getting Started", + "Command Line Interface (CLI)" + ] + }, + { + "objectID": "docs/torchao.html", + "href": "docs/torchao.html", + "title": "PyTorch ao", + "section": "", + "text": "To use the experimental optimizers (AdamWFp8, AdamW4bit, AdamW8bit) from PyTorch ao, please install the package as shown below.\n\n\n\n\n\n\nTip\n\n\n\nSome experimental optimizers are already present in regular PyTorch, so please re-check whether you actually need this package!\n\n\n\nInstallation\nStable release from the PyTorch index\npip install torchao --extra-index-url https://download.pytorch.org/whl/cu121 # full options are cpu/cu118/cu121/cu124\nNightly release\npip install --pre torchao-nightly --index-url https://download.pytorch.org/whl/nightly/cu121 # full options are cpu/cu118/cu121/cu124
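\nA minimal usage sketch (note the module path is an assumption: recent torchao releases expose these optimizers under torchao.optim, while older releases keep them in torchao.prototype.low_bit_optim):\nimport torch\nfrom torchao.optim import AdamW8bit # AdamW4bit / AdamWFp8 are drop-in alternatives\n\nmodel = torch.nn.Linear(1024, 1024).cuda()\noptimizer = AdamW8bit(model.parameters(), lr=1e-4) # keeps optimizer state in 8-bit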
", + "crumbs": [ + "Advanced Features", + "PyTorch ao" + ] + }, + { + "objectID": "docs/multi-gpu.html", + "href": "docs/multi-gpu.html", + "title": "Multi-GPU", + "section": "", + "text": "This guide covers advanced training configurations for multi-GPU setups using Axolotl.", + "crumbs": [ + "Deployments", + "Multi-GPU" + ] + }, + { + "objectID": "docs/multi-gpu.html#sec-overview", + "href": "docs/multi-gpu.html#sec-overview", + "title": "Multi-GPU", + "section": "1 Overview", + "text": "1 Overview\nAxolotl supports several methods for multi-GPU training:\n\nDeepSpeed (recommended)\nFSDP (Fully Sharded Data Parallel)\nSequence parallelism\nFSDP + QLoRA", + "crumbs": [ + "Deployments", + "Multi-GPU" + ] + }, + { + "objectID": "docs/multi-gpu.html#sec-deepspeed", + "href": "docs/multi-gpu.html#sec-deepspeed", + "title": "Multi-GPU", + "section": "2 DeepSpeed", + "text": "2 DeepSpeed\n\n2.1 Configuration\nAdd to your YAML config:\ndeepspeed: deepspeed_configs/zero1.json\n\n\n2.2 Usage\n# Fetch deepspeed configs (if not already present)\naxolotl fetch deepspeed_configs\n\n# Passing the arg via config\naxolotl train config.yml\n\n# Passing the arg via CLI\naxolotl train config.yml --deepspeed deepspeed_configs/zero1.json\n\n\n2.3 ZeRO Stages\nWe provide default configurations for:\n\nZeRO Stage 1 (zero1.json)\nZeRO Stage 1 with torch compile (zero1_torch_compile.json)\nZeRO Stage 2 (zero2.json)\nZeRO Stage 3 (zero3.json)\nZeRO Stage 3 with bf16 (zero3_bf16.json)\nZeRO Stage 3 with bf16 and CPU offload params (zero3_bf16_cpuoffload_params.json)\nZeRO Stage 3 with bf16 and CPU offload params and optimizer (zero3_bf16_cpuoffload_all.json)\n\n\n\n\n\n\n\nTip\n\n\n\nFor the best performance, choose the configuration that offloads the least to CPU memory while still fitting within VRAM.\nStart from Stage 1 -> Stage 2 -> Stage 3.\n\n\n\n\n\n\n\n\nTip\n\n\n\nUsing ZeRO Stage 3 with Single-GPU training\nZeRO Stage 3 can be used for training on a single GPU by manually setting the environment variables:\nWORLD_SIZE=1 LOCAL_RANK=0 MASTER_ADDR=0.0.0.0 MASTER_PORT=29500", + "crumbs": [ + "Deployments", + "Multi-GPU" + ] + }, + { + "objectID": "docs/multi-gpu.html#sec-fsdp", + "href": "docs/multi-gpu.html#sec-fsdp", + "title": "Multi-GPU", + "section": "3 Fully Sharded Data Parallel (FSDP)", + "text": "3 Fully Sharded Data Parallel (FSDP)\n\n\n\n\n\n\nNote\n\n\n\nFSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.\n\n\n\n3.1 Migrating from FSDP1 to FSDP2\nTo migrate your config from FSDP1 to FSDP2, you must use the fsdp_version top-level config field to specify the FSDP version, and\nalso follow the config field mapping below to update field names.\n\n3.1.1 Config mapping\n\n\n\nFSDP1\nFSDP2\n\n\n\n\nfsdp_sharding_strategy\nreshard_after_forward\n\n\nfsdp_backward_prefetch_policy\nREMOVED\n\n\nfsdp_backward_prefetch\nREMOVED\n\n\nfsdp_forward_prefetch\nREMOVED\n\n\nfsdp_sync_module_states\nREMOVED\n\n\nfsdp_cpu_ram_efficient_loading\ncpu_ram_efficient_loading\n\n\nfsdp_state_dict_type\nstate_dict_type\n\n\nfsdp_use_orig_params\nREMOVED\n\n\n\nFor more details, please see the migration guide in the torchtitan repo. In Axolotl,\nif you were using the following FSDP1 config:\nfsdp_version: 1\nfsdp_config:\n fsdp_offload_params: false\n fsdp_cpu_ram_efficient_loading: true\n fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP\n fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer\n fsdp_state_dict_type: FULL_STATE_DICT\n fsdp_sharding_strategy: FULL_SHARD\nYou can migrate to the following FSDP2 config:\nfsdp_version: 2\nfsdp_config:\n offload_params: false\n cpu_ram_efficient_loading: true\n auto_wrap_policy: TRANSFORMER_BASED_WRAP\n transformer_layer_cls_to_wrap: Qwen3DecoderLayer\n state_dict_type: FULL_STATE_DICT\n reshard_after_forward: true\n\n\n\n3.2 FSDP1 (deprecated)\n\n\n\n\n\n\nNote\n\n\n\nUsing fsdp to configure FSDP is deprecated and will be removed in an upcoming release of Axolotl. Please use fsdp_config as above instead.\n\n\nfsdp:\n - full_shard\n - auto_wrap\nfsdp_config:\n fsdp_offload_params: true\n fsdp_state_dict_type: FULL_STATE_DICT\n fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer", + "crumbs": [ + "Deployments", + "Multi-GPU" + ] + }, + { + "objectID": "docs/multi-gpu.html#sec-sequence-parallelism", + "href": "docs/multi-gpu.html#sec-sequence-parallelism", + "title": "Multi-GPU", + "section": "4 Sequence parallelism", + "text": "4 Sequence parallelism\nWe support sequence parallelism (SP) via the\nring-flash-attention project. This\nallows one to split up sequences across GPUs, which is useful when a\nsingle long sequence would otherwise cause OOM errors during model training.
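\nA minimal sketch of enabling SP in your YAML (assuming the sequence_parallel_degree field; the degree should evenly divide your GPU count, and SP builds on flash attention):\nsequence_parallel_degree: 2 # each sequence is split across 2 GPUs\nflash_attention: true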
\nSee our dedicated guide for more information.\n\n4.1 FSDP + QLoRA\nFor combining FSDP with QLoRA, see our dedicated guide.", + "crumbs": [ + "Deployments", + "Multi-GPU" + ] + }, + { + "objectID": "docs/multi-gpu.html#sec-performance", + "href": "docs/multi-gpu.html#sec-performance", + "title": "Multi-GPU", + "section": "5 Performance Optimization", + "text": "5 Performance Optimization\n\n5.1 Liger Kernel Integration\nPlease see the docs for more info.", + "crumbs": [ + "Deployments", + "Multi-GPU" + ] + }, + { + "objectID": "docs/multi-gpu.html#sec-troubleshooting", + "href": "docs/multi-gpu.html#sec-troubleshooting", + "title": "Multi-GPU", + "section": "6 Troubleshooting", + "text": "6 Troubleshooting\n\n6.1 NCCL Issues\nFor NCCL-related problems, see our NCCL troubleshooting guide.\n\n\n6.2 Common Problems\n\nMemory Issues\n\nReduce micro_batch_size\nReduce eval_batch_size\nAdjust gradient_accumulation_steps\nConsider using a higher ZeRO stage\n\nTraining Instability\n\nStart with DeepSpeed ZeRO-2\nMonitor loss values\nCheck learning rates\n\n\nFor more detailed troubleshooting, see our debugging guide.", + "crumbs": [ + "Deployments", + "Multi-GPU" + ] + }, { "objectID": "docs/rlhf.html", "href": "docs/rlhf.html", diff --git a/sitemap.xml b/sitemap.xml index c0e27cb89..2c76d6168 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -2,786 +2,790 @@ https://docs.axolotl.ai/TODO.html - 2025-07-31T22:18:52.245Z + 2025-08-01T06:18:40.023Z https://docs.axolotl.ai/index.html - 2025-07-31T22:18:52.265Z + 2025-08-01T06:18:40.044Z https://docs.axolotl.ai/docs/debugging.html - 2025-07-31T22:18:52.247Z + 2025-08-01T06:18:40.025Z https://docs.axolotl.ai/docs/amd_hpc.html - 2025-07-31T22:18:52.246Z + 2025-08-01T06:18:40.024Z https://docs.axolotl.ai/docs/api/utils.callbacks.mlflow_.html - 2025-07-31T22:22:20.582Z + 2025-08-01T06:21:52.992Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_expand_mask.html - 2025-07-31T22:22:20.009Z + 2025-08-01T06:21:52.411Z https://docs.axolotl.ai/docs/api/loaders.patch_manager.html - 2025-07-31T22:22:19.620Z + 2025-08-01T06:21:52.013Z https://docs.axolotl.ai/docs/api/core.chat.format.llama3x.html - 2025-07-31T22:22:19.305Z + 2025-08-01T06:21:51.693Z https://docs.axolotl.ai/docs/api/cli.train.html - 2025-07-31T22:22:19.361Z + 2025-08-01T06:21:51.749Z https://docs.axolotl.ai/docs/api/utils.callbacks.perplexity.html - 2025-07-31T22:22:20.574Z + 2025-08-01T06:21:52.984Z https://docs.axolotl.ai/docs/api/core.chat.messages.html - 2025-07-31T22:22:19.302Z + 2025-08-01T06:21:51.690Z https://docs.axolotl.ai/docs/api/utils.callbacks.lisa.html - 2025-07-31T22:22:20.579Z + 2025-08-01T06:21:52.989Z https://docs.axolotl.ai/docs/api/cli.merge_sharded_fsdp_weights.html - 2025-07-31T22:22:19.454Z + 2025-08-01T06:21:51.844Z https://docs.axolotl.ai/docs/api/monkeypatch.mixtral.html - 2025-07-31T22:22:20.068Z + 2025-08-01T06:21:52.471Z https://docs.axolotl.ai/docs/api/utils.chat_templates.html - 2025-07-31T22:22:20.105Z + 2025-08-01T06:21:52.508Z https://docs.axolotl.ai/docs/api/core.chat.format.shared.html - 2025-07-31T22:22:19.306Z + 2025-08-01T06:21:51.694Z https://docs.axolotl.ai/docs/api/core.trainers.mixins.optimizer.html - 2025-07-31T22:22:19.627Z + 2025-08-01T06:21:52.020Z https://docs.axolotl.ai/docs/api/utils.collators.mamba.html - 2025-07-31T22:22:20.522Z + 2025-08-01T06:21:52.931Z 
https://docs.axolotl.ai/docs/api/logging_config.html - 2025-07-31T22:22:19.251Z + 2025-08-01T06:21:51.638Z https://docs.axolotl.ai/docs/api/utils.collators.mm_chat.html - 2025-07-31T22:22:20.527Z + 2025-08-01T06:21:52.936Z https://docs.axolotl.ai/docs/api/prompt_strategies.completion.html - 2025-07-31T22:22:19.748Z + 2025-08-01T06:21:52.145Z https://docs.axolotl.ai/docs/api/kernels.utils.html - 2025-07-31T22:22:19.961Z + 2025-08-01T06:21:52.362Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chat_template.html - 2025-07-31T22:22:19.782Z + 2025-08-01T06:21:52.178Z https://docs.axolotl.ai/docs/api/kernels.swiglu.html - 2025-07-31T22:22:19.952Z + 2025-08-01T06:21:52.352Z https://docs.axolotl.ai/docs/api/common.const.html - 2025-07-31T22:22:20.483Z + 2025-08-01T06:21:52.891Z https://docs.axolotl.ai/docs/api/cli.cloud.base.html - 2025-07-31T22:22:19.477Z + 2025-08-01T06:21:51.868Z https://docs.axolotl.ai/docs/api/prompt_strategies.orpo.chat_template.html - 2025-07-31T22:22:19.845Z + 2025-08-01T06:21:52.242Z https://docs.axolotl.ai/docs/api/core.builders.rl.html - 2025-07-31T22:22:19.266Z + 2025-08-01T06:21:51.654Z https://docs.axolotl.ai/docs/api/utils.dict.html - 2025-07-31T22:22:20.196Z + 2025-08-01T06:21:52.601Z https://docs.axolotl.ai/docs/api/utils.schemas.integrations.html - 2025-07-31T22:22:20.307Z + 2025-08-01T06:21:52.713Z https://docs.axolotl.ai/docs/api/core.trainers.utils.html - 2025-07-31T22:22:19.585Z + 2025-08-01T06:21:51.978Z https://docs.axolotl.ai/docs/api/monkeypatch.trainer_fsdp_optim.html - 2025-07-31T22:22:20.057Z + 2025-08-01T06:21:52.459Z https://docs.axolotl.ai/docs/api/cli.evaluate.html - 2025-07-31T22:22:19.369Z + 2025-08-01T06:21:51.757Z https://docs.axolotl.ai/docs/api/core.builders.causal.html - 2025-07-31T22:22:19.262Z + 2025-08-01T06:21:51.649Z https://docs.axolotl.ai/docs/api/monkeypatch.multipack.html - 2025-07-31T22:22:20.004Z + 2025-08-01T06:21:52.406Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_patch_multipack.html - 2025-07-31T22:22:20.048Z + 2025-08-01T06:21:52.450Z https://docs.axolotl.ai/docs/api/cli.delinearize_llama4.html - 2025-07-31T22:22:19.420Z + 2025-08-01T06:21:51.809Z https://docs.axolotl.ai/docs/api/utils.schemas.trl.html - 2025-07-31T22:22:20.290Z + 2025-08-01T06:21:52.696Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.zephyr.html - 2025-07-31T22:22:19.804Z + 2025-08-01T06:21:52.201Z https://docs.axolotl.ai/docs/api/integrations.kd.trainer.html - 2025-07-31T22:22:20.470Z + 2025-08-01T06:21:52.878Z https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_disk.html - 2025-07-31T22:22:20.097Z + 2025-08-01T06:21:52.500Z https://docs.axolotl.ai/docs/api/utils.optimizers.adopt.html - 2025-07-31T22:22:20.204Z + 2025-08-01T06:21:52.609Z https://docs.axolotl.ai/docs/api/monkeypatch.data.batch_dataset_fetcher.html - 2025-07-31T22:22:20.066Z + 2025-08-01T06:21:52.469Z https://docs.axolotl.ai/docs/api/cli.cloud.modal_.html - 2025-07-31T22:22:19.483Z + 2025-08-01T06:21:51.874Z https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_chat.html - 2025-07-31T22:22:19.708Z + 2025-08-01T06:21:52.104Z https://docs.axolotl.ai/docs/api/utils.freeze.html - 2025-07-31T22:22:20.127Z + 2025-08-01T06:21:52.531Z https://docs.axolotl.ai/docs/api/prompt_strategies.bradley_terry.llama3.html - 2025-07-31T22:22:19.848Z + 2025-08-01T06:21:52.246Z https://docs.axolotl.ai/docs/api/integrations.base.html - 2025-07-31T22:22:20.458Z + 2025-08-01T06:21:52.866Z https://docs.axolotl.ai/docs/api/monkeypatch.unsloth_.html - 
2025-07-31T22:22:20.065Z + 2025-08-01T06:21:52.467Z https://docs.axolotl.ai/docs/api/prompt_strategies.kto.chatml.html - 2025-07-31T22:22:19.823Z + 2025-08-01T06:21:52.220Z https://docs.axolotl.ai/docs/api/cli.main.html - 2025-07-31T22:22:19.352Z + 2025-08-01T06:21:51.740Z https://docs.axolotl.ai/docs/api/common.datasets.html - 2025-07-31T22:22:20.497Z + 2025-08-01T06:21:52.906Z https://docs.axolotl.ai/docs/api/train.html - 2025-07-31T22:22:19.166Z + 2025-08-01T06:21:51.552Z https://docs.axolotl.ai/docs/api/utils.trainer.html - 2025-07-31T22:22:20.143Z + 2025-08-01T06:21:52.548Z https://docs.axolotl.ai/docs/api/prompt_strategies.llama2_chat.html - 2025-07-31T22:22:19.742Z + 2025-08-01T06:21:52.139Z https://docs.axolotl.ai/docs/api/index.html - 2025-07-31T22:22:19.104Z + 2025-08-01T06:21:51.490Z https://docs.axolotl.ai/docs/api/prompt_strategies.chat_template.html - 2025-07-31T22:22:19.695Z + 2025-08-01T06:21:52.090Z https://docs.axolotl.ai/docs/api/core.training_args.html - 2025-07-31T22:22:19.279Z + 2025-08-01T06:21:51.666Z https://docs.axolotl.ai/docs/api/kernels.quantize.html - 2025-07-31T22:22:19.960Z + 2025-08-01T06:21:52.360Z https://docs.axolotl.ai/docs/api/convert.html - 2025-07-31T22:22:19.200Z + 2025-08-01T06:21:51.587Z https://docs.axolotl.ai/docs/api/integrations.grokfast.optimizer.html - 2025-07-31T22:22:20.462Z + 2025-08-01T06:21:52.870Z https://docs.axolotl.ai/docs/api/prompt_strategies.stepwise_supervised.html - 2025-07-31T22:22:19.759Z + 2025-08-01T06:21:52.155Z https://docs.axolotl.ai/docs/api/utils.schemas.model.html - 2025-07-31T22:22:20.253Z + 2025-08-01T06:21:52.659Z https://docs.axolotl.ai/docs/api/utils.callbacks.qat.html - 2025-07-31T22:22:20.593Z + 2025-08-01T06:21:53.003Z https://docs.axolotl.ai/docs/api/loaders.constants.html - 2025-07-31T22:22:19.621Z + 2025-08-01T06:21:52.015Z https://docs.axolotl.ai/docs/api/cli.utils.sweeps.html - 2025-07-31T22:22:19.513Z + 2025-08-01T06:21:51.904Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.llama3.html - 2025-07-31T22:22:19.792Z + 2025-08-01T06:21:52.189Z https://docs.axolotl.ai/docs/api/core.datasets.transforms.chat_builder.html - 2025-07-31T22:22:19.319Z + 2025-08-01T06:21:51.707Z https://docs.axolotl.ai/docs/api/cli.utils.fetch.html - 2025-07-31T22:22:19.501Z + 2025-08-01T06:21:51.892Z https://docs.axolotl.ai/docs/api/core.trainers.mamba.html - 2025-07-31T22:22:19.554Z + 2025-08-01T06:21:51.946Z https://docs.axolotl.ai/docs/api/utils.schemas.enums.html - 2025-07-31T22:22:20.317Z + 2025-08-01T06:21:52.724Z https://docs.axolotl.ai/docs/api/utils.callbacks.profiler.html - 2025-07-31T22:22:20.577Z + 2025-08-01T06:21:52.987Z https://docs.axolotl.ai/docs/api/prompt_strategies.metharme.html - 2025-07-31T22:22:19.765Z + 2025-08-01T06:21:52.162Z https://docs.axolotl.ai/docs/api/core.trainers.trl.html - 2025-07-31T22:22:19.549Z + 2025-08-01T06:21:51.941Z https://docs.axolotl.ai/docs/api/prompt_strategies.orcamini.html - 2025-07-31T22:22:19.769Z + 2025-08-01T06:21:52.166Z https://docs.axolotl.ai/docs/api/utils.samplers.multipack.html - 2025-07-31T22:22:20.567Z + 2025-08-01T06:21:52.977Z https://docs.axolotl.ai/docs/api/utils.schedulers.html - 2025-07-31T22:22:20.171Z + 2025-08-01T06:21:52.575Z https://docs.axolotl.ai/docs/api/core.trainers.grpo.trainer.html - 2025-07-31T22:22:19.572Z + 2025-08-01T06:21:51.964Z https://docs.axolotl.ai/docs/api/prompt_tokenizers.html - 2025-07-31T22:22:19.242Z + 2025-08-01T06:21:51.629Z https://docs.axolotl.ai/docs/config-reference.html - 2025-07-31T22:22:33.863Z + 2025-08-01T06:22:06.060Z 
https://docs.axolotl.ai/docs/multimodal.html - 2025-07-31T22:18:52.250Z + 2025-08-01T06:18:40.028Z https://docs.axolotl.ai/docs/mixed_precision.html - 2025-07-31T22:18:52.250Z + 2025-08-01T06:18:40.028Z https://docs.axolotl.ai/docs/unsloth.html - 2025-07-31T22:18:52.250Z + 2025-08-01T06:18:40.029Z https://docs.axolotl.ai/docs/ray-integration.html - 2025-07-31T22:18:52.250Z + 2025-08-01T06:18:40.028Z https://docs.axolotl.ai/docs/dataset-formats/stepwise_supervised.html - 2025-07-31T22:18:52.246Z + 2025-08-01T06:18:40.024Z https://docs.axolotl.ai/docs/dataset-formats/template_free.html - 2025-07-31T22:18:52.246Z + 2025-08-01T06:18:40.024Z https://docs.axolotl.ai/docs/dataset-formats/index.html - 2025-07-31T22:18:52.246Z + 2025-08-01T06:18:40.024Z https://docs.axolotl.ai/docs/dataset-formats/pretraining.html - 2025-07-31T22:18:52.246Z + 2025-08-01T06:18:40.024Z - https://docs.axolotl.ai/docs/multi-gpu.html - 2025-07-31T22:18:52.250Z - - - https://docs.axolotl.ai/docs/torchao.html - 2025-07-31T22:18:52.250Z - - - https://docs.axolotl.ai/docs/cli.html - 2025-07-31T22:18:52.246Z - - - https://docs.axolotl.ai/docs/nccl.html - 2025-07-31T22:18:52.250Z - - - https://docs.axolotl.ai/docs/dataset_preprocessing.html - 2025-07-31T22:18:52.246Z - - - https://docs.axolotl.ai/docs/faq.html - 2025-07-31T22:18:52.247Z - - - https://docs.axolotl.ai/docs/qat.html - 2025-07-31T22:18:52.250Z - - - https://docs.axolotl.ai/docs/gradient_checkpointing.html - 2025-07-31T22:18:52.247Z - - - https://docs.axolotl.ai/docs/input_output.html - 2025-07-31T22:18:52.249Z - - - https://docs.axolotl.ai/src/axolotl/integrations/LICENSE.html - 2025-07-31T22:18:52.270Z - - - https://docs.axolotl.ai/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html - 2025-07-31T22:18:52.270Z - - - https://docs.axolotl.ai/docs/mac.html - 2025-07-31T22:18:52.250Z - - - https://docs.axolotl.ai/docs/lr_groups.html - 2025-07-31T22:18:52.250Z - - - https://docs.axolotl.ai/docs/dataset_loading.html - 2025-07-31T22:18:52.246Z - - - https://docs.axolotl.ai/docs/getting-started.html - 2025-07-31T22:18:52.247Z - - - https://docs.axolotl.ai/docs/lora_optims.html - 2025-07-31T22:18:52.250Z - - - https://docs.axolotl.ai/docs/multi-node.html - 2025-07-31T22:18:52.250Z - - - https://docs.axolotl.ai/docs/fsdp_qlora.html - 2025-07-31T22:18:52.247Z - - - https://docs.axolotl.ai/docs/inference.html - 2025-07-31T22:18:52.249Z + https://docs.axolotl.ai/docs/nd_parallelism.html + 2025-08-01T06:18:40.028Z https://docs.axolotl.ai/docs/sequence_parallelism.html - 2025-07-31T22:18:52.250Z + 2025-08-01T06:18:40.028Z + + + https://docs.axolotl.ai/docs/inference.html + 2025-08-01T06:18:40.027Z + + + https://docs.axolotl.ai/docs/fsdp_qlora.html + 2025-08-01T06:18:40.025Z + + + https://docs.axolotl.ai/docs/multi-node.html + 2025-08-01T06:18:40.028Z + + + https://docs.axolotl.ai/docs/lora_optims.html + 2025-08-01T06:18:40.028Z + + + https://docs.axolotl.ai/docs/getting-started.html + 2025-08-01T06:18:40.025Z + + + https://docs.axolotl.ai/docs/dataset_loading.html + 2025-08-01T06:18:40.025Z + + + https://docs.axolotl.ai/docs/lr_groups.html + 2025-08-01T06:18:40.028Z + + + https://docs.axolotl.ai/docs/mac.html + 2025-08-01T06:18:40.028Z + + + https://docs.axolotl.ai/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html + 2025-08-01T06:18:40.048Z + + + https://docs.axolotl.ai/src/axolotl/integrations/LICENSE.html + 2025-08-01T06:18:40.048Z + + + https://docs.axolotl.ai/docs/input_output.html + 2025-08-01T06:18:40.028Z + + + 
https://docs.axolotl.ai/docs/gradient_checkpointing.html + 2025-08-01T06:18:40.025Z + + + https://docs.axolotl.ai/docs/qat.html + 2025-08-01T06:18:40.028Z + + + https://docs.axolotl.ai/docs/faq.html + 2025-08-01T06:18:40.025Z + + + https://docs.axolotl.ai/docs/dataset_preprocessing.html + 2025-08-01T06:18:40.025Z + + + https://docs.axolotl.ai/docs/nccl.html + 2025-08-01T06:18:40.028Z + + + https://docs.axolotl.ai/docs/cli.html + 2025-08-01T06:18:40.024Z + + + https://docs.axolotl.ai/docs/torchao.html + 2025-08-01T06:18:40.029Z + + + https://docs.axolotl.ai/docs/multi-gpu.html + 2025-08-01T06:18:40.028Z https://docs.axolotl.ai/docs/rlhf.html - 2025-07-31T22:18:52.250Z + 2025-08-01T06:18:40.028Z https://docs.axolotl.ai/docs/dataset-formats/tokenized.html - 2025-07-31T22:18:52.246Z + 2025-08-01T06:18:40.024Z https://docs.axolotl.ai/docs/dataset-formats/conversation.html - 2025-07-31T22:18:52.246Z + 2025-08-01T06:18:40.024Z https://docs.axolotl.ai/docs/dataset-formats/inst_tune.html - 2025-07-31T22:18:52.246Z + 2025-08-01T06:18:40.024Z https://docs.axolotl.ai/docs/reward_modelling.html - 2025-07-31T22:18:52.250Z + 2025-08-01T06:18:40.028Z https://docs.axolotl.ai/docs/docker.html - 2025-07-31T22:18:52.247Z + 2025-08-01T06:18:40.025Z https://docs.axolotl.ai/docs/installation.html - 2025-07-31T22:18:52.249Z + 2025-08-01T06:18:40.028Z https://docs.axolotl.ai/docs/quantize.html - 2025-07-31T22:18:52.250Z + 2025-08-01T06:18:40.028Z https://docs.axolotl.ai/docs/custom_integrations.html - 2025-07-31T22:18:52.246Z + 2025-08-01T06:18:40.024Z https://docs.axolotl.ai/docs/batch_vs_grad.html - 2025-07-31T22:18:52.246Z + 2025-08-01T06:18:40.024Z https://docs.axolotl.ai/docs/api/cli.utils.train.html - 2025-07-31T22:22:19.523Z + 2025-08-01T06:21:51.915Z https://docs.axolotl.ai/docs/api/cli.art.html - 2025-07-31T22:22:19.391Z + 2025-08-01T06:21:51.780Z https://docs.axolotl.ai/docs/api/core.trainers.grpo.sampler.html - 2025-07-31T22:22:19.584Z + 2025-08-01T06:21:51.976Z https://docs.axolotl.ai/docs/api/loaders.model.html - 2025-07-31T22:22:19.595Z + 2025-08-01T06:21:51.988Z https://docs.axolotl.ai/docs/api/cli.preprocess.html - 2025-07-31T22:22:19.462Z + 2025-08-01T06:21:51.852Z https://docs.axolotl.ai/docs/api/cli.utils.html - 2025-07-31T22:22:19.484Z + 2025-08-01T06:21:51.875Z https://docs.axolotl.ai/docs/api/cli.inference.html - 2025-07-31T22:22:19.434Z + 2025-08-01T06:21:51.823Z https://docs.axolotl.ai/docs/api/monkeypatch.btlm_attn_hijack_flash.html - 2025-07-31T22:22:20.047Z + 2025-08-01T06:21:52.449Z https://docs.axolotl.ai/docs/api/datasets.html - 2025-07-31T22:22:19.187Z + 2025-08-01T06:21:51.574Z https://docs.axolotl.ai/docs/api/monkeypatch.transformers_fa_utils.html - 2025-07-31T22:22:20.063Z + 2025-08-01T06:21:52.466Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_flash.html - 2025-07-31T22:22:19.987Z + 2025-08-01T06:21:52.388Z https://docs.axolotl.ai/docs/api/monkeypatch.relora.html - 2025-07-31T22:22:20.007Z + 2025-08-01T06:21:52.409Z https://docs.axolotl.ai/docs/api/monkeypatch.stablelm_attn_hijack_flash.html - 2025-07-31T22:22:20.054Z + 2025-08-01T06:21:52.456Z https://docs.axolotl.ai/docs/api/loaders.adapter.html - 2025-07-31T22:22:19.610Z + 2025-08-01T06:21:52.003Z https://docs.axolotl.ai/docs/api/core.trainers.dpo.trainer.html - 2025-07-31T22:22:19.561Z + 2025-08-01T06:21:51.953Z https://docs.axolotl.ai/docs/api/integrations.cut_cross_entropy.args.html - 2025-07-31T22:22:20.461Z + 2025-08-01T06:21:52.869Z https://docs.axolotl.ai/docs/api/monkeypatch.utils.html - 
2025-07-31T22:22:20.045Z + 2025-08-01T06:21:52.447Z https://docs.axolotl.ai/docs/api/loaders.processor.html - 2025-07-31T22:22:19.605Z + 2025-08-01T06:21:51.997Z https://docs.axolotl.ai/docs/api/cli.config.html - 2025-07-31T22:22:19.415Z + 2025-08-01T06:21:51.804Z https://docs.axolotl.ai/docs/api/integrations.liger.args.html - 2025-07-31T22:22:20.473Z + 2025-08-01T06:21:52.881Z https://docs.axolotl.ai/docs/api/loaders.tokenizer.html - 2025-07-31T22:22:19.603Z + 2025-08-01T06:21:51.996Z https://docs.axolotl.ai/docs/api/utils.schemas.config.html - 2025-07-31T22:22:20.246Z + 2025-08-01T06:21:52.652Z https://docs.axolotl.ai/docs/api/utils.ctx_managers.sequence_parallel.html - 2025-07-31T22:22:19.661Z + 2025-08-01T06:21:52.055Z https://docs.axolotl.ai/docs/api/core.trainers.mixins.scheduler.html - 2025-07-31T22:22:19.637Z + 2025-08-01T06:21:52.030Z https://docs.axolotl.ai/docs/api/core.trainers.base.html - 2025-07-31T22:22:19.534Z + 2025-08-01T06:21:51.926Z https://docs.axolotl.ai/docs/api/cli.utils.args.html - 2025-07-31T22:22:19.496Z + 2025-08-01T06:21:51.887Z https://docs.axolotl.ai/docs/api/prompt_strategies.messages.chat.html - 2025-07-31T22:22:19.780Z + 2025-08-01T06:21:52.177Z https://docs.axolotl.ai/docs/api/monkeypatch.lora_kernels.html - 2025-07-31T22:22:20.037Z + 2025-08-01T06:21:52.439Z https://docs.axolotl.ai/docs/api/kernels.lora.html - 2025-07-31T22:22:19.931Z + 2025-08-01T06:21:52.331Z https://docs.axolotl.ai/docs/api/cli.vllm_serve.html - 2025-07-31T22:22:19.473Z + 2025-08-01T06:21:51.864Z https://docs.axolotl.ai/docs/api/utils.schemas.multimodal.html - 2025-07-31T22:22:20.295Z + 2025-08-01T06:21:52.701Z https://docs.axolotl.ai/docs/api/utils.schemas.utils.html - 2025-07-31T22:22:20.323Z + 2025-08-01T06:21:52.729Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_xformers.html - 2025-07-31T22:22:19.988Z + 2025-08-01T06:21:52.389Z https://docs.axolotl.ai/docs/api/integrations.lm_eval.args.html - 2025-07-31T22:22:20.476Z + 2025-08-01T06:21:52.885Z https://docs.axolotl.ai/docs/api/monkeypatch.mistral_attn_hijack_flash.html - 2025-07-31T22:22:20.002Z + 2025-08-01T06:21:52.404Z https://docs.axolotl.ai/docs/api/utils.collators.core.html - 2025-07-31T22:22:20.500Z + 2025-08-01T06:21:52.909Z https://docs.axolotl.ai/docs/api/core.chat.format.chatml.html - 2025-07-31T22:22:19.304Z + 2025-08-01T06:21:51.691Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.passthrough.html - 2025-07-31T22:22:19.807Z + 2025-08-01T06:21:52.204Z https://docs.axolotl.ai/docs/api/core.datasets.chat.html - 2025-07-31T22:22:19.312Z + 2025-08-01T06:21:51.699Z https://docs.axolotl.ai/docs/api/utils.bench.html - 2025-07-31T22:22:20.119Z + 2025-08-01T06:21:52.523Z https://docs.axolotl.ai/docs/api/utils.schemas.training.html - 2025-07-31T22:22:20.260Z + 2025-08-01T06:21:52.666Z https://docs.axolotl.ai/docs/api/utils.collators.batching.html - 2025-07-31T22:22:20.519Z + 2025-08-01T06:21:52.928Z https://docs.axolotl.ai/docs/api/prompt_strategies.input_output.html - 2025-07-31T22:22:19.754Z + 2025-08-01T06:21:52.151Z https://docs.axolotl.ai/docs/api/utils.lora.html - 2025-07-31T22:22:20.110Z + 2025-08-01T06:21:52.514Z https://docs.axolotl.ai/docs/api/prompt_strategies.base.html - 2025-07-31T22:22:19.662Z + 2025-08-01T06:21:52.057Z https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_w_system.html - 2025-07-31T22:22:19.722Z + 2025-08-01T06:21:52.118Z https://docs.axolotl.ai/docs/api/utils.schemas.datasets.html - 2025-07-31T22:22:20.278Z + 2025-08-01T06:21:52.683Z 
https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.user_defined.html - 2025-07-31T22:22:19.805Z + 2025-08-01T06:21:52.202Z https://docs.axolotl.ai/docs/api/utils.schemas.peft.html - 2025-07-31T22:22:20.286Z + 2025-08-01T06:21:52.692Z https://docs.axolotl.ai/docs/api/prompt_strategies.pygmalion.html - 2025-07-31T22:22:19.776Z + 2025-08-01T06:21:52.173Z https://docs.axolotl.ai/docs/api/common.architectures.html - 2025-07-31T22:22:20.481Z + 2025-08-01T06:21:52.890Z https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html - 2025-07-31T22:22:20.071Z + 2025-08-01T06:21:52.474Z https://docs.axolotl.ai/docs/api/utils.callbacks.comet_.html - 2025-07-31T22:22:20.586Z + 2025-08-01T06:21:52.996Z https://docs.axolotl.ai/docs/api/integrations.spectrum.args.html - 2025-07-31T22:22:20.480Z + 2025-08-01T06:21:52.888Z https://docs.axolotl.ai/docs/api/cli.quantize.html - 2025-07-31T22:22:19.467Z + 2025-08-01T06:21:51.857Z https://docs.axolotl.ai/docs/api/cli.checks.html - 2025-07-31T22:22:19.397Z + 2025-08-01T06:21:51.786Z https://docs.axolotl.ai/docs/api/prompt_strategies.kto.llama3.html - 2025-07-31T22:22:19.815Z + 2025-08-01T06:21:52.212Z https://docs.axolotl.ai/docs/api/utils.model_shard_quant.html - 2025-07-31T22:22:20.115Z + 2025-08-01T06:21:52.519Z https://docs.axolotl.ai/docs/api/utils.quantization.html - 2025-07-31T22:22:20.232Z + 2025-08-01T06:21:52.638Z https://docs.axolotl.ai/docs/api/core.trainers.mixins.rng_state_loader.html - 2025-07-31T22:22:19.630Z + 2025-08-01T06:21:52.024Z https://docs.axolotl.ai/docs/api/kernels.geglu.html - 2025-07-31T22:22:19.942Z + 2025-08-01T06:21:52.342Z https://docs.axolotl.ai/docs/api/utils.data.pretraining.html - 2025-07-31T22:22:20.205Z + 2025-08-01T06:21:52.610Z https://docs.axolotl.ai/docs/api/prompt_strategies.kto.user_defined.html - 2025-07-31T22:22:19.824Z + 2025-08-01T06:21:52.222Z https://docs.axolotl.ai/docs/api/core.builders.base.html - 2025-07-31T22:22:19.257Z + 2025-08-01T06:21:51.644Z https://docs.axolotl.ai/docs/api/cli.merge_lora.html - 2025-07-31T22:22:19.442Z + 2025-08-01T06:21:51.832Z https://docs.axolotl.ai/docs/api/cli.utils.load.html - 2025-07-31T22:22:19.507Z + 2025-08-01T06:21:51.898Z https://docs.axolotl.ai/docs/api/utils.data.sft.html - 2025-07-31T22:22:20.212Z + 2025-08-01T06:21:52.617Z https://docs.axolotl.ai/docs/api/prompt_strategies.user_defined.html - 2025-07-31T22:22:19.730Z + 2025-08-01T06:21:52.126Z https://docs.axolotl.ai/docs/api/utils.tokenization.html - 2025-07-31T22:22:20.104Z + 2025-08-01T06:21:52.507Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chatml.html - 2025-07-31T22:22:19.802Z + 2025-08-01T06:21:52.199Z https://docs.axolotl.ai/docs/api/models.mamba.modeling_mamba.html - 2025-07-31T22:22:20.498Z + 2025-08-01T06:21:52.907Z https://docs.axolotl.ai/docs/api/cli.args.html - 2025-07-31T22:22:19.388Z + 2025-08-01T06:21:51.776Z https://docs.axolotl.ai/docs/api/evaluate.html - 2025-07-31T22:22:19.176Z + 2025-08-01T06:21:51.563Z https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_instruct.html - 2025-07-31T22:22:19.710Z + 2025-08-01T06:21:52.106Z https://docs.axolotl.ai/docs/api/utils.distributed.html - 2025-07-31T22:22:20.191Z + 2025-08-01T06:21:52.595Z https://docs.axolotl.ai/docs/multipack.html - 2025-07-31T22:18:52.250Z + 2025-08-01T06:18:40.028Z https://docs.axolotl.ai/examples/colab-notebooks/colab-axolotl-example.html - 2025-07-31T22:18:52.254Z + 2025-08-01T06:18:40.032Z https://docs.axolotl.ai/FAQS.html - 2025-07-31T22:18:52.244Z + 2025-08-01T06:18:40.023Z diff --git 
a/src/axolotl/integrations/LICENSE.html b/src/axolotl/integrations/LICENSE.html index c45a9adfb..e69f686c9 100644 --- a/src/axolotl/integrations/LICENSE.html +++ b/src/axolotl/integrations/LICENSE.html @@ -356,9 +356,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Mixed Precision Training - + diff --git a/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html b/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html index 0dc21c21a..4234c05d9 100644 --- a/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html +++ b/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html @@ -356,9 +356,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Mixed Precision Training - +