Built site for gh-pages

2025-11-24 03:27:39 +00:00
parent 24cd17113a
commit cd7fdaeeb6
7 changed files with 436 additions and 420 deletions
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-899be981
+e37ff14d
--- a/docs/custom_integrations.html
+++ b/docs/custom_integrations.html
@@ -619,7 +619,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <ul>
 <li>If you are installing from pip</li>
 </ul>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> uninstall <span class="at">-y</span> cut-cross-entropy <span class="kw">&amp;&amp;</span> <span class="ex">pip3</span> install <span class="st">"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec"</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> uninstall <span class="at">-y</span> cut-cross-entropy <span class="kw">&amp;&amp;</span> <span class="ex">pip3</span> install <span class="st">"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@5eff953"</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 </section>
 <section id="usage" class="level3">
 <h3 class="anchored" data-anchor-id="usage">Usage</h3>
@@ -663,6 +663,9 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <li>mistral3</li>
 <li>mixtral</li>
 <li>mllama</li>
 <li>olmo</li>
 <li>olmo2</li>
 <li>olmo3</li>
 <li>phi</li>
 <li>phi3</li>
 <li>phi4_multimodal</li>
--- a/docs/multi-gpu.html
+++ b/docs/multi-gpu.html
@@ -511,30 +511,28 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
    <h2 id="toc-title">On this page</h2>
  <ul>
-  <li><a href="#sec-overview" id="toc-sec-overview" class="nav-link active" data-scroll-target="#sec-overview"><span class="header-section-number">1</span> Overview</a></li>
+  <li><a href="#sec-overview" id="toc-sec-overview" class="nav-link active" data-scroll-target="#sec-overview">Overview</a></li>
-  <li><a href="#sec-deepspeed" id="toc-sec-deepspeed" class="nav-link" data-scroll-target="#sec-deepspeed"><span class="header-section-number">2</span> DeepSpeed</a>
+  <li><a href="#sec-deepspeed" id="toc-sec-deepspeed" class="nav-link" data-scroll-target="#sec-deepspeed">DeepSpeed</a>
  <ul class="collapse">
-  <li><a href="#sec-deepspeed-config" id="toc-sec-deepspeed-config" class="nav-link" data-scroll-target="#sec-deepspeed-config"><span class="header-section-number">2.1</span> Configuration</a></li>
+  <li><a href="#sec-deepspeed-config" id="toc-sec-deepspeed-config" class="nav-link" data-scroll-target="#sec-deepspeed-config">Configuration</a></li>
-  <li><a href="#sec-deepspeed-usage" id="toc-sec-deepspeed-usage" class="nav-link" data-scroll-target="#sec-deepspeed-usage"><span class="header-section-number">2.2</span> Usage</a></li>
+  <li><a href="#sec-deepspeed-usage" id="toc-sec-deepspeed-usage" class="nav-link" data-scroll-target="#sec-deepspeed-usage">Usage</a></li>
-  <li><a href="#sec-zero-stages" id="toc-sec-zero-stages" class="nav-link" data-scroll-target="#sec-zero-stages"><span class="header-section-number">2.3</span> ZeRO Stages</a></li>
+  <li><a href="#sec-zero-stages" id="toc-sec-zero-stages" class="nav-link" data-scroll-target="#sec-zero-stages">ZeRO Stages</a></li>
  </ul></li>
-  <li><a href="#sec-fsdp" id="toc-sec-fsdp" class="nav-link" data-scroll-target="#sec-fsdp"><span class="header-section-number">3</span> Fully Sharded Data Parallel (FSDP)</a>
+  <li><a href="#sec-fsdp" id="toc-sec-fsdp" class="nav-link" data-scroll-target="#sec-fsdp">Fully Sharded Data Parallel (FSDP)</a>
  <ul class="collapse">
-  <li><a href="#sec-migrate-fsdp1-fsdp2" id="toc-sec-migrate-fsdp1-fsdp2" class="nav-link" data-scroll-target="#sec-migrate-fsdp1-fsdp2"><span class="header-section-number">3.1</span> Migrating from FSDP1 to FSDP2</a></li>
+  <li><a href="#sec-fsdp-qlora" id="toc-sec-fsdp-qlora" class="nav-link" data-scroll-target="#sec-fsdp-qlora">FSDP + QLoRA</a></li>
-  <li><a href="#sec-fsdp-config" id="toc-sec-fsdp-config" class="nav-link" data-scroll-target="#sec-fsdp-config"><span class="header-section-number">3.2</span> FSDP1 (deprecated)</a></li>
+  <li><a href="#sec-migrate-fsdp1-fsdp2" id="toc-sec-migrate-fsdp1-fsdp2" class="nav-link" data-scroll-target="#sec-migrate-fsdp1-fsdp2">Migrating from FSDP1 to FSDP2</a></li>
  <li><a href="#sec-fsdp-config" id="toc-sec-fsdp-config" class="nav-link" data-scroll-target="#sec-fsdp-config">FSDP1 (deprecated)</a></li>
  </ul></li>
-  <li><a href="#sec-sequence-parallelism" id="toc-sec-sequence-parallelism" class="nav-link" data-scroll-target="#sec-sequence-parallelism"><span class="header-section-number">4</span> Sequence parallelism</a>
+  <li><a href="#sec-sequence-parallelism" id="toc-sec-sequence-parallelism" class="nav-link" data-scroll-target="#sec-sequence-parallelism">Sequence parallelism</a></li>
  <li><a href="#sec-performance" id="toc-sec-performance" class="nav-link" data-scroll-target="#sec-performance">Performance Optimization</a>
  <ul class="collapse">
-  <li><a href="#sec-fsdp-qlora" id="toc-sec-fsdp-qlora" class="nav-link" data-scroll-target="#sec-fsdp-qlora"><span class="header-section-number">4.1</span> FSDP + QLoRA</a></li>
+  <li><a href="#sec-liger" id="toc-sec-liger" class="nav-link" data-scroll-target="#sec-liger">Liger Kernel Integration</a></li>
  </ul></li>
-  <li><a href="#sec-performance" id="toc-sec-performance" class="nav-link" data-scroll-target="#sec-performance"><span class="header-section-number">5</span> Performance Optimization</a>
+  <li><a href="#sec-troubleshooting" id="toc-sec-troubleshooting" class="nav-link" data-scroll-target="#sec-troubleshooting">Troubleshooting</a>
  <ul class="collapse">
-  <li><a href="#sec-liger" id="toc-sec-liger" class="nav-link" data-scroll-target="#sec-liger"><span class="header-section-number">5.1</span> Liger Kernel Integration</a></li>
+  <li><a href="#sec-nccl" id="toc-sec-nccl" class="nav-link" data-scroll-target="#sec-nccl">NCCL Issues</a></li>
-  </ul></li>
+  <li><a href="#sec-common-problems" id="toc-sec-common-problems" class="nav-link" data-scroll-target="#sec-common-problems">Common Problems</a></li>
  <li><a href="#sec-troubleshooting" id="toc-sec-troubleshooting" class="nav-link" data-scroll-target="#sec-troubleshooting"><span class="header-section-number">6</span> Troubleshooting</a>
  <ul class="collapse">
  <li><a href="#sec-nccl" id="toc-sec-nccl" class="nav-link" data-scroll-target="#sec-nccl"><span class="header-section-number">6.1</span> NCCL Issues</a></li>
  <li><a href="#sec-common-problems" id="toc-sec-common-problems" class="nav-link" data-scroll-target="#sec-common-problems"><span class="header-section-number">6.2</span> Common Problems</a></li>
  </ul></li>
  </ul>
 </nav>
@@ -562,25 +560,30 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <p>This guide covers advanced training configurations for multi-GPU setups using Axolotl.</p>
-<section id="sec-overview" class="level2" data-number="1">
+<section id="sec-overview" class="level2">
-<h2 data-number="1" class="anchored" data-anchor-id="sec-overview"><span class="header-section-number">1</span> Overview</h2>
+<h2 class="anchored" data-anchor-id="sec-overview">Overview</h2>
-<p>Axolotl supports several methods for multi-GPU training:</p>
+<p>When training on multiple GPUs, Axolotl supports 3 sharding/parallelism strategies. Additionally, you can layer specific optimization features on top of that strategy.</p>
 <p>You generally cannot combine these strategies; they are mutually exclusive.</p>
 <ol type="1">
 <li><strong>DeepSpeed</strong>: Powerful optimization library, supports ZeRO stages 1-3.</li>
 <li><strong>FSDP (Fully Sharded Data Parallel)</strong>: PyTorch’s native sharding implementation (Recommended).</li>
 <li><strong>DDP (Distributed Data Parallel)</strong>: PyTorch’s native parallelism implementation (Default if neither of the above are selected).</li>
 </ol>
 <p>These features can often be combined with the strategies above:</p>
 <ul>
-<li>DeepSpeed (recommended)</li>
+<li><strong>Sequence Parallelism</strong>: Splits long sequences across GPUs (Compatible with DDP, DeepSpeed, and FSDP).</li>
-<li>FSDP (Fully Sharded Data Parallel)</li>
+<li><strong>FSDP + QLoRA</strong>: Combines 4-bit quantization with FSDP (Specific to FSDP).</li>
 <li>Sequence parallelism</li>
 <li>FSDP + QLoRA</li>
 </ul>
 </section>
-<section id="sec-deepspeed" class="level2" data-number="2">
+<section id="sec-deepspeed" class="level2">
-<h2 data-number="2" class="anchored" data-anchor-id="sec-deepspeed"><span class="header-section-number">2</span> DeepSpeed</h2>
+<h2 class="anchored" data-anchor-id="sec-deepspeed">DeepSpeed</h2>
-<section id="sec-deepspeed-config" class="level3" data-number="2.1">
+<section id="sec-deepspeed-config" class="level3">
-<h3 data-number="2.1" class="anchored" data-anchor-id="sec-deepspeed-config"><span class="header-section-number">2.1</span> Configuration</h3>
+<h3 class="anchored" data-anchor-id="sec-deepspeed-config">Configuration</h3>
 <p>Add to your YAML config:</p>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">deepspeed</span><span class="kw">:</span><span class="at"> deepspeed_configs/zero1.json</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 </section>
-<section id="sec-deepspeed-usage" class="level3" data-number="2.2">
+<section id="sec-deepspeed-usage" class="level3">
-<h3 data-number="2.2" class="anchored" data-anchor-id="sec-deepspeed-usage"><span class="header-section-number">2.2</span> Usage</h3>
+<h3 class="anchored" data-anchor-id="sec-deepspeed-usage">Usage</h3>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Fetch deepspeed configs (if not already present)</span></span>
 <span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> fetch deepspeed_configs</span>
 <span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a></span>
@@ -590,8 +593,8 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Passing arg via cli</span></span>
 <span id="cb2-8"><a href="#cb2-8" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> train config.yml <span class="at">--deepspeed</span> deepspeed_configs/zero1.json</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 </section>
-<section id="sec-zero-stages" class="level3" data-number="2.3">
+<section id="sec-zero-stages" class="level3">
-<h3 data-number="2.3" class="anchored" data-anchor-id="sec-zero-stages"><span class="header-section-number">2.3</span> ZeRO Stages</h3>
+<h3 class="anchored" data-anchor-id="sec-zero-stages">ZeRO Stages</h3>
 <p>We provide default configurations for:</p>
 <ul>
 <li>ZeRO Stage 1 (<code>zero1.json</code>)</li>
@@ -618,8 +621,9 @@ Tip
 </div>
 </section>
 </section>
-<section id="sec-fsdp" class="level2" data-number="3">
+<section id="sec-fsdp" class="level2">
-<h2 data-number="3" class="anchored" data-anchor-id="sec-fsdp"><span class="header-section-number">3</span> Fully Sharded Data Parallel (FSDP)</h2>
+<h2 class="anchored" data-anchor-id="sec-fsdp">Fully Sharded Data Parallel (FSDP)</h2>
 <p>FSDP allows you to shard model parameters, gradients, and optimizer states across data parallel workers.</p>
 <div class="callout callout-style-default callout-note callout-titled">
 <div class="callout-header d-flex align-content-center">
 <div class="callout-icon-container">
@@ -633,12 +637,16 @@ Note
 <p>FSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.</p>
 </div>
 </div>
-<section id="sec-migrate-fsdp1-fsdp2" class="level3" data-number="3.1">
+<section id="sec-fsdp-qlora" class="level3">
-<h3 data-number="3.1" class="anchored" data-anchor-id="sec-migrate-fsdp1-fsdp2"><span class="header-section-number">3.1</span> Migrating from FSDP1 to FSDP2</h3>
+<h3 class="anchored" data-anchor-id="sec-fsdp-qlora">FSDP + QLoRA</h3>
 <p>For combining FSDP with QLoRA, see our <a href="../docs/fsdp_qlora.html">dedicated guide</a>.</p>
 </section>
 <section id="sec-migrate-fsdp1-fsdp2" class="level3">
 <h3 class="anchored" data-anchor-id="sec-migrate-fsdp1-fsdp2">Migrating from FSDP1 to FSDP2</h3>
 <p>To migrate your config from FSDP1 to FSDP2, you must use the <code>fsdp_version</code> top-level config field to specify the FSDP version, and
 also follow the config field mapping below to update field names.</p>
-<section id="config-mapping" class="level4" data-number="3.1.1">
+<section id="config-mapping" class="level4">
-<h4 data-number="3.1.1" class="anchored" data-anchor-id="config-mapping"><span class="header-section-number">3.1.1</span> Config mapping</h4>
+<h4 class="anchored" data-anchor-id="config-mapping">Config mapping</h4>
 <table class="caption-top table">
 <thead>
 <tr class="header">
@@ -706,8 +714,8 @@ if you were using the following FSDP1 config:</p>
 <span id="cb4-8"><a href="#cb4-8" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">reshard_after_forward</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 </section>
 </section>
-<section id="sec-fsdp-config" class="level3" data-number="3.2">
+<section id="sec-fsdp-config" class="level3">
-<h3 data-number="3.2" class="anchored" data-anchor-id="sec-fsdp-config"><span class="header-section-number">3.2</span> FSDP1 (deprecated)</h3>
+<h3 class="anchored" data-anchor-id="sec-fsdp-config">FSDP1 (deprecated)</h3>
 <div class="callout callout-style-default callout-note callout-titled">
 <div class="callout-header d-flex align-content-center">
 <div class="callout-icon-container">
@@ -730,33 +738,29 @@ Note
 <span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_transformer_layer_cls_to_wrap</span><span class="kw">:</span><span class="at"> LlamaDecoderLayer</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 </section>
 </section>
-<section id="sec-sequence-parallelism" class="level2" data-number="4">
+<section id="sec-sequence-parallelism" class="level2">
-<h2 data-number="4" class="anchored" data-anchor-id="sec-sequence-parallelism"><span class="header-section-number">4</span> Sequence parallelism</h2>
+<h2 class="anchored" data-anchor-id="sec-sequence-parallelism">Sequence parallelism</h2>
 <p>We support sequence parallelism (SP) via the
 <a href="https://github.com/zhuzilin/ring-flash-attention">ring-flash-attention</a> project. This
 allows one to split up sequences across GPUs, which is useful in the event that a
 single sequence causes OOM errors during model training.</p>
 <p>See our <a href="../docs/sequence_parallelism.html">dedicated guide</a> for more information.</p>
 <section id="sec-fsdp-qlora" class="level3" data-number="4.1">
 <h3 data-number="4.1" class="anchored" data-anchor-id="sec-fsdp-qlora"><span class="header-section-number">4.1</span> FSDP + QLoRA</h3>
 <p>For combining FSDP with QLoRA, see our <a href="../docs/fsdp_qlora.html">dedicated guide</a>.</p>
 </section>
-</section>
+<section id="sec-performance" class="level2">
-<section id="sec-performance" class="level2" data-number="5">
+<h2 class="anchored" data-anchor-id="sec-performance">Performance Optimization</h2>
-<h2 data-number="5" class="anchored" data-anchor-id="sec-performance"><span class="header-section-number">5</span> Performance Optimization</h2>
+<section id="sec-liger" class="level3">
-<section id="sec-liger" class="level3" data-number="5.1">
+<h3 class="anchored" data-anchor-id="sec-liger">Liger Kernel Integration</h3>
 <h3 data-number="5.1" class="anchored" data-anchor-id="sec-liger"><span class="header-section-number">5.1</span> Liger Kernel Integration</h3>
 <p>Please see <a href="../docs/custom_integrations.html#liger">docs</a> for more info.</p>
 </section>
 </section>
-<section id="sec-troubleshooting" class="level2" data-number="6">
+<section id="sec-troubleshooting" class="level2">
-<h2 data-number="6" class="anchored" data-anchor-id="sec-troubleshooting"><span class="header-section-number">6</span> Troubleshooting</h2>
+<h2 class="anchored" data-anchor-id="sec-troubleshooting">Troubleshooting</h2>
-<section id="sec-nccl" class="level3" data-number="6.1">
+<section id="sec-nccl" class="level3">
-<h3 data-number="6.1" class="anchored" data-anchor-id="sec-nccl"><span class="header-section-number">6.1</span> NCCL Issues</h3>
+<h3 class="anchored" data-anchor-id="sec-nccl">NCCL Issues</h3>
 <p>For NCCL-related problems, see our <a href="../docs/nccl.html">NCCL troubleshooting guide</a>.</p>
 </section>
-<section id="sec-common-problems" class="level3" data-number="6.2">
+<section id="sec-common-problems" class="level3">
-<h3 data-number="6.2" class="anchored" data-anchor-id="sec-common-problems"><span class="header-section-number">6.2</span> Common Problems</h3>
+<h3 class="anchored" data-anchor-id="sec-common-problems">Common Problems</h3>
 <div class="tabset-margin-container"></div><div class="panel-tabset">
 <ul class="nav nav-tabs" role="tablist"><li class="nav-item" role="presentation"><a class="nav-link active" id="tabset-1-1-tab" data-bs-toggle="tab" data-bs-target="#tabset-1-1" role="tab" aria-controls="tabset-1-1" aria-selected="true" href="">Memory Issues</a></li><li class="nav-item" role="presentation"><a class="nav-link" id="tabset-1-2-tab" data-bs-toggle="tab" data-bs-target="#tabset-1-2" role="tab" aria-controls="tabset-1-2" aria-selected="false" href="">Training Instability</a></li></ul>
 <div class="tab-content">
@@ -1243,7 +1247,7 @@ single sequence causes OOM errors during model training.</p>
 <span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a><span class="co">  html:</span></span>
 <span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a><span class="co">    toc: true</span></span>
 <span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a><span class="co">    toc-depth: 3</span></span>
-<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a><span class="co">    number-sections: true</span></span>
+<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a><span class="co">    # number-sections: true</span></span>
 <span id="cb6-8"><a href="#cb6-8" aria-hidden="true" tabindex="-1"></a><span class="co">    code-tools: true</span></span>
 <span id="cb6-9"><a href="#cb6-9" aria-hidden="true" tabindex="-1"></a><span class="an">execute:</span></span>
 <span id="cb6-10"><a href="#cb6-10" aria-hidden="true" tabindex="-1"></a><span class="co">  enabled: false</span></span>
@@ -1253,173 +1257,181 @@ single sequence causes OOM errors during model training.</p>
 <span id="cb6-14"><a href="#cb6-14" aria-hidden="true" tabindex="-1"></a></span>
 <span id="cb6-15"><a href="#cb6-15" aria-hidden="true" tabindex="-1"></a><span class="fu">## Overview {#sec-overview}</span></span>
 <span id="cb6-16"><a href="#cb6-16" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-17"><a href="#cb6-17" aria-hidden="true" tabindex="-1"></a>Axolotl supports several methods for multi-GPU training:</span>
+<span id="cb6-17"><a href="#cb6-17" aria-hidden="true" tabindex="-1"></a>When training on multiple GPUs, Axolotl supports 3 sharding/parallelism strategies. Additionally, you can layer specific optimization features on top of that strategy.</span>
 <span id="cb6-18"><a href="#cb6-18" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-19"><a href="#cb6-19" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>DeepSpeed (recommended)</span>
+<span id="cb6-19"><a href="#cb6-19" aria-hidden="true" tabindex="-1"></a>You generally cannot combine these strategies; they are mutually exclusive.</span>
-<span id="cb6-20"><a href="#cb6-20" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>FSDP (Fully Sharded Data Parallel)</span>
+<span id="cb6-20"><a href="#cb6-20" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-21"><a href="#cb6-21" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Sequence parallelism</span>
+<span id="cb6-21"><a href="#cb6-21" aria-hidden="true" tabindex="-1"></a><span class="ss">1.  </span>**DeepSpeed**: Powerful optimization library, supports ZeRO stages 1-3.</span>
-<span id="cb6-22"><a href="#cb6-22" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>FSDP + QLoRA</span>
+<span id="cb6-22"><a href="#cb6-22" aria-hidden="true" tabindex="-1"></a><span class="ss">2.  </span>**FSDP (Fully Sharded Data Parallel)**: PyTorch's native sharding implementation (Recommended).</span>
-<span id="cb6-23"><a href="#cb6-23" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-23"><a href="#cb6-23" aria-hidden="true" tabindex="-1"></a><span class="ss">3.  </span>**DDP (Distributed Data Parallel)**: PyTorch's native parallelism implementation (Default if neither of the above are selected).</span>
-<span id="cb6-24"><a href="#cb6-24" aria-hidden="true" tabindex="-1"></a><span class="fu">## DeepSpeed {#sec-deepspeed}</span></span>
+<span id="cb6-24"><a href="#cb6-24" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-25"><a href="#cb6-25" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-25"><a href="#cb6-25" aria-hidden="true" tabindex="-1"></a>These features can often be combined with the strategies above:</span>
-<span id="cb6-26"><a href="#cb6-26" aria-hidden="true" tabindex="-1"></a><span class="fu">### Configuration {#sec-deepspeed-config}</span></span>
+<span id="cb6-26"><a href="#cb6-26" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-27"><a href="#cb6-27" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-27"><a href="#cb6-27" aria-hidden="true" tabindex="-1"></a><span class="ss">*   </span>**Sequence Parallelism**: Splits long sequences across GPUs (Compatible with DDP, DeepSpeed, and FSDP).</span>
-<span id="cb6-28"><a href="#cb6-28" aria-hidden="true" tabindex="-1"></a>Add to your YAML config:</span>
+<span id="cb6-28"><a href="#cb6-28" aria-hidden="true" tabindex="-1"></a><span class="ss">*   </span>**FSDP + QLoRA**: Combines 4-bit quantization with FSDP (Specific to FSDP).</span>
 <span id="cb6-29"><a href="#cb6-29" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-30"><a href="#cb6-30" aria-hidden="true" tabindex="-1"></a><span class="in">```{.yaml}</span></span>
+<span id="cb6-30"><a href="#cb6-30" aria-hidden="true" tabindex="-1"></a><span class="fu">## DeepSpeed {#sec-deepspeed}</span></span>
-<span id="cb6-31"><a href="#cb6-31" aria-hidden="true" tabindex="-1"></a><span class="fu">deepspeed</span><span class="kw">:</span><span class="at"> deepspeed_configs/zero1.json</span></span>
+<span id="cb6-31"><a href="#cb6-31" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-32"><a href="#cb6-32" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
+<span id="cb6-32"><a href="#cb6-32" aria-hidden="true" tabindex="-1"></a><span class="fu">### Configuration {#sec-deepspeed-config}</span></span>
-<span id="cb6-33"><a href="#cb6-33" aria-hidden="true" tabindex="-1"></a><span class="fu">### Usage {#sec-deepspeed-usage}</span></span>
+<span id="cb6-33"><a href="#cb6-33" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-34"><a href="#cb6-34" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-34"><a href="#cb6-34" aria-hidden="true" tabindex="-1"></a>Add to your YAML config:</span>
-<span id="cb6-35"><a href="#cb6-35" aria-hidden="true" tabindex="-1"></a><span class="in">```{.bash}</span></span>
+<span id="cb6-35"><a href="#cb6-35" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-36"><a href="#cb6-36" aria-hidden="true" tabindex="-1"></a><span class="co"># Fetch deepspeed configs (if not already present)</span></span>
+<span id="cb6-36"><a href="#cb6-36" aria-hidden="true" tabindex="-1"></a><span class="in">```{.yaml}</span></span>
-<span id="cb6-37"><a href="#cb6-37" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> fetch deepspeed_configs</span>
+<span id="cb6-37"><a href="#cb6-37" aria-hidden="true" tabindex="-1"></a><span class="fu">deepspeed</span><span class="kw">:</span><span class="at"> deepspeed_configs/zero1.json</span></span>
-<span id="cb6-38"><a href="#cb6-38" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-38"><a href="#cb6-38" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb6-39"><a href="#cb6-39" aria-hidden="true" tabindex="-1"></a><span class="co"># Passing arg via config</span></span>
+<span id="cb6-39"><a href="#cb6-39" aria-hidden="true" tabindex="-1"></a><span class="fu">### Usage {#sec-deepspeed-usage}</span></span>
-<span id="cb6-40"><a href="#cb6-40" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> train config.yml</span>
+<span id="cb6-40"><a href="#cb6-40" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-41"><a href="#cb6-41" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-41"><a href="#cb6-41" aria-hidden="true" tabindex="-1"></a><span class="in">```{.bash}</span></span>
-<span id="cb6-42"><a href="#cb6-42" aria-hidden="true" tabindex="-1"></a><span class="co"># Passing arg via cli</span></span>
+<span id="cb6-42"><a href="#cb6-42" aria-hidden="true" tabindex="-1"></a><span class="co"># Fetch deepspeed configs (if not already present)</span></span>
-<span id="cb6-43"><a href="#cb6-43" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> train config.yml <span class="at">--deepspeed</span> deepspeed_configs/zero1.json</span>
+<span id="cb6-43"><a href="#cb6-43" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> fetch deepspeed_configs</span>
-<span id="cb6-44"><a href="#cb6-44" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
+<span id="cb6-44"><a href="#cb6-44" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-45"><a href="#cb6-45" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-45"><a href="#cb6-45" aria-hidden="true" tabindex="-1"></a><span class="co"># Passing arg via config</span></span>
-<span id="cb6-46"><a href="#cb6-46" aria-hidden="true" tabindex="-1"></a><span class="fu">### ZeRO Stages {#sec-zero-stages}</span></span>
+<span id="cb6-46"><a href="#cb6-46" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> train config.yml</span>
 <span id="cb6-47"><a href="#cb6-47" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-48"><a href="#cb6-48" aria-hidden="true" tabindex="-1"></a>We provide default configurations for:</span>
+<span id="cb6-48"><a href="#cb6-48" aria-hidden="true" tabindex="-1"></a><span class="co"># Passing arg via cli</span></span>
-<span id="cb6-49"><a href="#cb6-49" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-49"><a href="#cb6-49" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> train config.yml <span class="at">--deepspeed</span> deepspeed_configs/zero1.json</span>
-<span id="cb6-50"><a href="#cb6-50" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 1 (<span class="in">`zero1.json`</span>)</span>
+<span id="cb6-50"><a href="#cb6-50" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb6-51"><a href="#cb6-51" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 1 with torch compile (<span class="in">`zero1_torch_compile.json`</span>)</span>
+<span id="cb6-51"><a href="#cb6-51" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-52"><a href="#cb6-52" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 2 (<span class="in">`zero2.json`</span>)</span>
+<span id="cb6-52"><a href="#cb6-52" aria-hidden="true" tabindex="-1"></a><span class="fu">### ZeRO Stages {#sec-zero-stages}</span></span>
-<span id="cb6-53"><a href="#cb6-53" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 3 (<span class="in">`zero3.json`</span>)</span>
+<span id="cb6-53"><a href="#cb6-53" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-54"><a href="#cb6-54" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 3 with bf16 (<span class="in">`zero3_bf16.json`</span>)</span>
+<span id="cb6-54"><a href="#cb6-54" aria-hidden="true" tabindex="-1"></a>We provide default configurations for:</span>
-<span id="cb6-55"><a href="#cb6-55" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 3 with bf16 and CPU offload params(<span class="in">`zero3_bf16_cpuoffload_params.json`</span>)</span>
+<span id="cb6-55"><a href="#cb6-55" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-56"><a href="#cb6-56" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 3 with bf16 and CPU offload params and optimizer (<span class="in">`zero3_bf16_cpuoffload_all.json`</span>)</span>
+<span id="cb6-56"><a href="#cb6-56" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 1 (<span class="in">`zero1.json`</span>)</span>
-<span id="cb6-57"><a href="#cb6-57" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-57"><a href="#cb6-57" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 1 with torch compile (<span class="in">`zero1_torch_compile.json`</span>)</span>
-<span id="cb6-58"><a href="#cb6-58" aria-hidden="true" tabindex="-1"></a>::: {.callout-tip}</span>
+<span id="cb6-58"><a href="#cb6-58" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 2 (<span class="in">`zero2.json`</span>)</span>
-<span id="cb6-59"><a href="#cb6-59" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-59"><a href="#cb6-59" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 3 (<span class="in">`zero3.json`</span>)</span>
-<span id="cb6-60"><a href="#cb6-60" aria-hidden="true" tabindex="-1"></a>Choose the configuration that offloads the least amount to memory while still being able to fit on VRAM for best performance.</span>
+<span id="cb6-60"><a href="#cb6-60" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 3 with bf16 (<span class="in">`zero3_bf16.json`</span>)</span>
-<span id="cb6-61"><a href="#cb6-61" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-61"><a href="#cb6-61" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 3 with bf16 and CPU offload params(<span class="in">`zero3_bf16_cpuoffload_params.json`</span>)</span>
-<span id="cb6-62"><a href="#cb6-62" aria-hidden="true" tabindex="-1"></a>Start from Stage 1 -&gt; Stage 2 -&gt; Stage 3.</span>
+<span id="cb6-62"><a href="#cb6-62" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 3 with bf16 and CPU offload params and optimizer (<span class="in">`zero3_bf16_cpuoffload_all.json`</span>)</span>
 <span id="cb6-63"><a href="#cb6-63" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-64"><a href="#cb6-64" aria-hidden="true" tabindex="-1"></a>:::</span>
+<span id="cb6-64"><a href="#cb6-64" aria-hidden="true" tabindex="-1"></a>::: {.callout-tip}</span>
 <span id="cb6-65"><a href="#cb6-65" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-66"><a href="#cb6-66" aria-hidden="true" tabindex="-1"></a><span class="fu">## Fully Sharded Data Parallel (FSDP) {#sec-fsdp}</span></span>
+<span id="cb6-66"><a href="#cb6-66" aria-hidden="true" tabindex="-1"></a>Choose the configuration that offloads the least amount to memory while still being able to fit on VRAM for best performance.</span>
 <span id="cb6-67"><a href="#cb6-67" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-68"><a href="#cb6-68" aria-hidden="true" tabindex="-1"></a>::: {.callout-note}</span>
+<span id="cb6-68"><a href="#cb6-68" aria-hidden="true" tabindex="-1"></a>Start from Stage 1 -&gt; Stage 2 -&gt; Stage 3.</span>
 <span id="cb6-69"><a href="#cb6-69" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-70"><a href="#cb6-70" aria-hidden="true" tabindex="-1"></a>FSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.</span>
+<span id="cb6-70"><a href="#cb6-70" aria-hidden="true" tabindex="-1"></a>:::</span>
 <span id="cb6-71"><a href="#cb6-71" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-72"><a href="#cb6-72" aria-hidden="true" tabindex="-1"></a>:::</span>
+<span id="cb6-72"><a href="#cb6-72" aria-hidden="true" tabindex="-1"></a><span class="fu">## Fully Sharded Data Parallel (FSDP) {#sec-fsdp}</span></span>
 <span id="cb6-73"><a href="#cb6-73" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-74"><a href="#cb6-74" aria-hidden="true" tabindex="-1"></a><span class="fu">### Migrating from FSDP1 to FSDP2 {#sec-migrate-fsdp1-fsdp2}</span></span>
+<span id="cb6-74"><a href="#cb6-74" aria-hidden="true" tabindex="-1"></a>FSDP allows you to shard model parameters, gradients, and optimizer states across data parallel workers.</span>
 <span id="cb6-75"><a href="#cb6-75" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-76"><a href="#cb6-76" aria-hidden="true" tabindex="-1"></a>To migrate your config from FSDP1 to FSDP2, you must use the <span class="in">`fsdp_version`</span> top-level config field to specify the FSDP version, and</span>
+<span id="cb6-76"><a href="#cb6-76" aria-hidden="true" tabindex="-1"></a>::: {.callout-note}</span>
-<span id="cb6-77"><a href="#cb6-77" aria-hidden="true" tabindex="-1"></a>also follow the config field mapping below to update field names.</span>
+<span id="cb6-77"><a href="#cb6-77" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-78"><a href="#cb6-78" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-78"><a href="#cb6-78" aria-hidden="true" tabindex="-1"></a>FSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.</span>
-<span id="cb6-79"><a href="#cb6-79" aria-hidden="true" tabindex="-1"></a><span class="fu">#### Config mapping</span></span>
+<span id="cb6-79"><a href="#cb6-79" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-80"><a href="#cb6-80" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-80"><a href="#cb6-80" aria-hidden="true" tabindex="-1"></a>:::</span>
-<span id="cb6-81"><a href="#cb6-81" aria-hidden="true" tabindex="-1"></a>FSDP1 | FSDP2</span>
+<span id="cb6-81"><a href="#cb6-81" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-82"><a href="#cb6-82" aria-hidden="true" tabindex="-1"></a>-------- | --------</span>
+<span id="cb6-82"><a href="#cb6-82" aria-hidden="true" tabindex="-1"></a><span class="fu">### FSDP + QLoRA {#sec-fsdp-qlora}</span></span>
-<span id="cb6-83"><a href="#cb6-83" aria-hidden="true" tabindex="-1"></a>fsdp_sharding_strategy | reshard_after_forward</span>
+<span id="cb6-83"><a href="#cb6-83" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-84"><a href="#cb6-84" aria-hidden="true" tabindex="-1"></a>fsdp_backward_prefetch_policy | **REMOVED**</span>
+<span id="cb6-84"><a href="#cb6-84" aria-hidden="true" tabindex="-1"></a>For combining FSDP with QLoRA, see our <span class="co">[</span><span class="ot">dedicated guide</span><span class="co">](fsdp_qlora.qmd)</span>.</span>
-<span id="cb6-85"><a href="#cb6-85" aria-hidden="true" tabindex="-1"></a>fsdp_backward_prefetch | **REMOVED**</span>
+<span id="cb6-85"><a href="#cb6-85" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-86"><a href="#cb6-86" aria-hidden="true" tabindex="-1"></a>fsdp_forward_prefetch | **REMOVED**</span>
+<span id="cb6-86"><a href="#cb6-86" aria-hidden="true" tabindex="-1"></a><span class="fu">### Migrating from FSDP1 to FSDP2 {#sec-migrate-fsdp1-fsdp2}</span></span>
-<span id="cb6-87"><a href="#cb6-87" aria-hidden="true" tabindex="-1"></a>fsdp_sync_module_states | **REMOVED**</span>
+<span id="cb6-87"><a href="#cb6-87" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-88"><a href="#cb6-88" aria-hidden="true" tabindex="-1"></a>fsdp_cpu_ram_efficient_loading | cpu_ram_efficient_loading</span>
+<span id="cb6-88"><a href="#cb6-88" aria-hidden="true" tabindex="-1"></a>To migrate your config from FSDP1 to FSDP2, you must use the <span class="in">`fsdp_version`</span> top-level config field to specify the FSDP version, and</span>
-<span id="cb6-89"><a href="#cb6-89" aria-hidden="true" tabindex="-1"></a>fsdp_state_dict_type | state_dict_type</span>
+<span id="cb6-89"><a href="#cb6-89" aria-hidden="true" tabindex="-1"></a>also follow the config field mapping below to update field names.</span>
-<span id="cb6-90"><a href="#cb6-90" aria-hidden="true" tabindex="-1"></a>fsdp_use_orig_params | **REMOVED**</span>
+<span id="cb6-90"><a href="#cb6-90" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-91"><a href="#cb6-91" aria-hidden="true" tabindex="-1"></a>fsdp_activation_checkpointing | activation_checkpointing</span>
+<span id="cb6-91"><a href="#cb6-91" aria-hidden="true" tabindex="-1"></a><span class="fu">#### Config mapping</span></span>
 <span id="cb6-92"><a href="#cb6-92" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-93"><a href="#cb6-93" aria-hidden="true" tabindex="-1"></a>For more details, please see the migration guide in the <span class="co">[</span><span class="ot">torchtitan repo</span><span class="co">](https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md)</span>. In Axolotl,</span>
+<span id="cb6-93"><a href="#cb6-93" aria-hidden="true" tabindex="-1"></a>FSDP1 | FSDP2</span>
-<span id="cb6-94"><a href="#cb6-94" aria-hidden="true" tabindex="-1"></a>if you were using the following FSDP1 config:</span>
+<span id="cb6-94"><a href="#cb6-94" aria-hidden="true" tabindex="-1"></a>-------- | --------</span>
-<span id="cb6-95"><a href="#cb6-95" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-95"><a href="#cb6-95" aria-hidden="true" tabindex="-1"></a>fsdp_sharding_strategy | reshard_after_forward</span>
-<span id="cb6-96"><a href="#cb6-96" aria-hidden="true" tabindex="-1"></a><span class="in">```{.yaml}</span></span>
+<span id="cb6-96"><a href="#cb6-96" aria-hidden="true" tabindex="-1"></a>fsdp_backward_prefetch_policy | **REMOVED**</span>
-<span id="cb6-97"><a href="#cb6-97" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_version</span><span class="kw">:</span><span class="at"> </span><span class="dv">1</span></span>
+<span id="cb6-97"><a href="#cb6-97" aria-hidden="true" tabindex="-1"></a>fsdp_backward_prefetch | **REMOVED**</span>
-<span id="cb6-98"><a href="#cb6-98" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_config</span><span class="kw">:</span></span>
+<span id="cb6-98"><a href="#cb6-98" aria-hidden="true" tabindex="-1"></a>fsdp_forward_prefetch | **REMOVED**</span>
-<span id="cb6-99"><a href="#cb6-99" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_offload_params</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
+<span id="cb6-99"><a href="#cb6-99" aria-hidden="true" tabindex="-1"></a>fsdp_sync_module_states | **REMOVED**</span>
-<span id="cb6-100"><a href="#cb6-100" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_cpu_ram_efficient_loading</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
+<span id="cb6-100"><a href="#cb6-100" aria-hidden="true" tabindex="-1"></a>fsdp_cpu_ram_efficient_loading | cpu_ram_efficient_loading</span>
-<span id="cb6-101"><a href="#cb6-101" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_auto_wrap_policy</span><span class="kw">:</span><span class="at"> TRANSFORMER_BASED_WRAP</span></span>
+<span id="cb6-101"><a href="#cb6-101" aria-hidden="true" tabindex="-1"></a>fsdp_state_dict_type | state_dict_type</span>
-<span id="cb6-102"><a href="#cb6-102" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_transformer_layer_cls_to_wrap</span><span class="kw">:</span><span class="at"> Qwen3DecoderLayer</span></span>
+<span id="cb6-102"><a href="#cb6-102" aria-hidden="true" tabindex="-1"></a>fsdp_use_orig_params | **REMOVED**</span>
-<span id="cb6-103"><a href="#cb6-103" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_state_dict_type</span><span class="kw">:</span><span class="at"> FULL_STATE_DICT</span></span>
+<span id="cb6-103"><a href="#cb6-103" aria-hidden="true" tabindex="-1"></a>fsdp_activation_checkpointing | activation_checkpointing</span>
-<span id="cb6-104"><a href="#cb6-104" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_sharding_strategy</span><span class="kw">:</span><span class="at"> FULL_SHARD</span></span>
+<span id="cb6-104"><a href="#cb6-104" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-105"><a href="#cb6-105" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
+<span id="cb6-105"><a href="#cb6-105" aria-hidden="true" tabindex="-1"></a>For more details, please see the migration guide in the <span class="co">[</span><span class="ot">torchtitan repo</span><span class="co">](https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md)</span>. In Axolotl,</span>
-<span id="cb6-106"><a href="#cb6-106" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-106"><a href="#cb6-106" aria-hidden="true" tabindex="-1"></a>if you were using the following FSDP1 config:</span>
-<span id="cb6-107"><a href="#cb6-107" aria-hidden="true" tabindex="-1"></a>You can migrate to the following FSDP2 config:</span>
+<span id="cb6-107"><a href="#cb6-107" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-108"><a href="#cb6-108" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-108"><a href="#cb6-108" aria-hidden="true" tabindex="-1"></a><span class="in">```{.yaml}</span></span>
-<span id="cb6-109"><a href="#cb6-109" aria-hidden="true" tabindex="-1"></a><span class="in">```{.yaml}</span></span>
+<span id="cb6-109"><a href="#cb6-109" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_version</span><span class="kw">:</span><span class="at"> </span><span class="dv">1</span></span>
-<span id="cb6-110"><a href="#cb6-110" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_version</span><span class="kw">:</span><span class="at"> </span><span class="dv">2</span></span>
+<span id="cb6-110"><a href="#cb6-110" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_config</span><span class="kw">:</span></span>
-<span id="cb6-111"><a href="#cb6-111" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_config</span><span class="kw">:</span></span>
+<span id="cb6-111"><a href="#cb6-111" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_offload_params</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
-<span id="cb6-112"><a href="#cb6-112" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">offload_params</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
+<span id="cb6-112"><a href="#cb6-112" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_cpu_ram_efficient_loading</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
-<span id="cb6-113"><a href="#cb6-113" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">cpu_ram_efficient_loading</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
+<span id="cb6-113"><a href="#cb6-113" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_auto_wrap_policy</span><span class="kw">:</span><span class="at"> TRANSFORMER_BASED_WRAP</span></span>
-<span id="cb6-114"><a href="#cb6-114" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">auto_wrap_policy</span><span class="kw">:</span><span class="at"> TRANSFORMER_BASED_WRAP</span></span>
+<span id="cb6-114"><a href="#cb6-114" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_transformer_layer_cls_to_wrap</span><span class="kw">:</span><span class="at"> Qwen3DecoderLayer</span></span>
-<span id="cb6-115"><a href="#cb6-115" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">transformer_layer_cls_to_wrap</span><span class="kw">:</span><span class="at"> Qwen3DecoderLayer</span></span>
+<span id="cb6-115"><a href="#cb6-115" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_state_dict_type</span><span class="kw">:</span><span class="at"> FULL_STATE_DICT</span></span>
-<span id="cb6-116"><a href="#cb6-116" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">state_dict_type</span><span class="kw">:</span><span class="at"> FULL_STATE_DICT</span></span>
+<span id="cb6-116"><a href="#cb6-116" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_sharding_strategy</span><span class="kw">:</span><span class="at"> FULL_SHARD</span></span>
-<span id="cb6-117"><a href="#cb6-117" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">reshard_after_forward</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
+<span id="cb6-117"><a href="#cb6-117" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb6-118"><a href="#cb6-118" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
+<span id="cb6-118"><a href="#cb6-118" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-119"><a href="#cb6-119" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-119"><a href="#cb6-119" aria-hidden="true" tabindex="-1"></a>You can migrate to the following FSDP2 config:</span>
-<span id="cb6-120"><a href="#cb6-120" aria-hidden="true" tabindex="-1"></a><span class="fu">### FSDP1 (deprecated) {#sec-fsdp-config}</span></span>
+<span id="cb6-120"><a href="#cb6-120" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-121"><a href="#cb6-121" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-121"><a href="#cb6-121" aria-hidden="true" tabindex="-1"></a><span class="in">```{.yaml}</span></span>
-<span id="cb6-122"><a href="#cb6-122" aria-hidden="true" tabindex="-1"></a>::: {.callout-note}</span>
+<span id="cb6-122"><a href="#cb6-122" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_version</span><span class="kw">:</span><span class="at"> </span><span class="dv">2</span></span>
-<span id="cb6-123"><a href="#cb6-123" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-123"><a href="#cb6-123" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_config</span><span class="kw">:</span></span>
-<span id="cb6-124"><a href="#cb6-124" aria-hidden="true" tabindex="-1"></a>Using <span class="in">`fsdp`</span> to configure FSDP is deprecated and will be removed in an upcoming release of Axolotl. Please use <span class="in">`fsdp_config`</span> as above instead.</span>
+<span id="cb6-124"><a href="#cb6-124" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">offload_params</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
-<span id="cb6-125"><a href="#cb6-125" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-125"><a href="#cb6-125" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">cpu_ram_efficient_loading</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
-<span id="cb6-126"><a href="#cb6-126" aria-hidden="true" tabindex="-1"></a>:::</span>
+<span id="cb6-126"><a href="#cb6-126" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">auto_wrap_policy</span><span class="kw">:</span><span class="at"> TRANSFORMER_BASED_WRAP</span></span>
-<span id="cb6-127"><a href="#cb6-127" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-127"><a href="#cb6-127" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">transformer_layer_cls_to_wrap</span><span class="kw">:</span><span class="at"> Qwen3DecoderLayer</span></span>
-<span id="cb6-128"><a href="#cb6-128" aria-hidden="true" tabindex="-1"></a><span class="in">```{.yaml}</span></span>
+<span id="cb6-128"><a href="#cb6-128" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">state_dict_type</span><span class="kw">:</span><span class="at"> FULL_STATE_DICT</span></span>
-<span id="cb6-129"><a href="#cb6-129" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp</span><span class="kw">:</span></span>
+<span id="cb6-129"><a href="#cb6-129" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">reshard_after_forward</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
-<span id="cb6-130"><a href="#cb6-130" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="kw">-</span><span class="at"> </span>full_shard</span>
+<span id="cb6-130"><a href="#cb6-130" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb6-131"><a href="#cb6-131" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="kw">-</span><span class="at"> </span>auto_wrap</span>
+<span id="cb6-131"><a href="#cb6-131" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-132"><a href="#cb6-132" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_config</span><span class="kw">:</span></span>
+<span id="cb6-132"><a href="#cb6-132" aria-hidden="true" tabindex="-1"></a><span class="fu">### FSDP1 (deprecated) {#sec-fsdp-config}</span></span>
-<span id="cb6-133"><a href="#cb6-133" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_offload_params</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
+<span id="cb6-133"><a href="#cb6-133" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-134"><a href="#cb6-134" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_state_dict_type</span><span class="kw">:</span><span class="at"> FULL_STATE_DICT</span></span>
+<span id="cb6-134"><a href="#cb6-134" aria-hidden="true" tabindex="-1"></a>::: {.callout-note}</span>
-<span id="cb6-135"><a href="#cb6-135" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_transformer_layer_cls_to_wrap</span><span class="kw">:</span><span class="at"> LlamaDecoderLayer</span></span>
+<span id="cb6-135"><a href="#cb6-135" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-136"><a href="#cb6-136" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
+<span id="cb6-136"><a href="#cb6-136" aria-hidden="true" tabindex="-1"></a>Using <span class="in">`fsdp`</span> to configure FSDP is deprecated and will be removed in an upcoming release of Axolotl. Please use <span class="in">`fsdp_config`</span> as above instead.</span>
 <span id="cb6-137"><a href="#cb6-137" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-138"><a href="#cb6-138" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-138"><a href="#cb6-138" aria-hidden="true" tabindex="-1"></a>:::</span>
-<span id="cb6-139"><a href="#cb6-139" aria-hidden="true" tabindex="-1"></a><span class="fu">## Sequence parallelism {#sec-sequence-parallelism}</span></span>
+<span id="cb6-139"><a href="#cb6-139" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-140"><a href="#cb6-140" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-140"><a href="#cb6-140" aria-hidden="true" tabindex="-1"></a><span class="in">```{.yaml}</span></span>
-<span id="cb6-141"><a href="#cb6-141" aria-hidden="true" tabindex="-1"></a>We support sequence parallelism (SP) via the</span>
+<span id="cb6-141"><a href="#cb6-141" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp</span><span class="kw">:</span></span>
-<span id="cb6-142"><a href="#cb6-142" aria-hidden="true" tabindex="-1"></a><span class="co">[</span><span class="ot">ring-flash-attention</span><span class="co">](https://github.com/zhuzilin/ring-flash-attention)</span> project. This</span>
+<span id="cb6-142"><a href="#cb6-142" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="kw">-</span><span class="at"> </span>full_shard</span>
-<span id="cb6-143"><a href="#cb6-143" aria-hidden="true" tabindex="-1"></a>allows one to split up sequences across GPUs, which is useful in the event that a</span>
+<span id="cb6-143"><a href="#cb6-143" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="kw">-</span><span class="at"> </span>auto_wrap</span>
-<span id="cb6-144"><a href="#cb6-144" aria-hidden="true" tabindex="-1"></a>single sequence causes OOM errors during model training.</span>
+<span id="cb6-144"><a href="#cb6-144" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_config</span><span class="kw">:</span></span>
-<span id="cb6-145"><a href="#cb6-145" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-145"><a href="#cb6-145" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_offload_params</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
-<span id="cb6-146"><a href="#cb6-146" aria-hidden="true" tabindex="-1"></a>See our <span class="co">[</span><span class="ot">dedicated guide</span><span class="co">](sequence_parallelism.qmd)</span> for more information.</span>
+<span id="cb6-146"><a href="#cb6-146" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_state_dict_type</span><span class="kw">:</span><span class="at"> FULL_STATE_DICT</span></span>
-<span id="cb6-147"><a href="#cb6-147" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-147"><a href="#cb6-147" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_transformer_layer_cls_to_wrap</span><span class="kw">:</span><span class="at"> LlamaDecoderLayer</span></span>
-<span id="cb6-148"><a href="#cb6-148" aria-hidden="true" tabindex="-1"></a><span class="fu">### FSDP + QLoRA {#sec-fsdp-qlora}</span></span>
+<span id="cb6-148"><a href="#cb6-148" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
 <span id="cb6-149"><a href="#cb6-149" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-150"><a href="#cb6-150" aria-hidden="true" tabindex="-1"></a>For combining FSDP with QLoRA, see our <span class="co">[</span><span class="ot">dedicated guide</span><span class="co">](fsdp_qlora.qmd)</span>.</span>
+<span id="cb6-150"><a href="#cb6-150" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-151"><a href="#cb6-151" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-151"><a href="#cb6-151" aria-hidden="true" tabindex="-1"></a><span class="fu">## Sequence parallelism {#sec-sequence-parallelism}</span></span>
-<span id="cb6-152"><a href="#cb6-152" aria-hidden="true" tabindex="-1"></a><span class="fu">## Performance Optimization {#sec-performance}</span></span>
+<span id="cb6-152"><a href="#cb6-152" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-153"><a href="#cb6-153" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-153"><a href="#cb6-153" aria-hidden="true" tabindex="-1"></a>We support sequence parallelism (SP) via the</span>
-<span id="cb6-154"><a href="#cb6-154" aria-hidden="true" tabindex="-1"></a><span class="fu">### Liger Kernel Integration {#sec-liger}</span></span>
+<span id="cb6-154"><a href="#cb6-154" aria-hidden="true" tabindex="-1"></a><span class="co">[</span><span class="ot">ring-flash-attention</span><span class="co">](https://github.com/zhuzilin/ring-flash-attention)</span> project. This</span>
-<span id="cb6-155"><a href="#cb6-155" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-155"><a href="#cb6-155" aria-hidden="true" tabindex="-1"></a>allows one to split up sequences across GPUs, which is useful in the event that a</span>
-<span id="cb6-156"><a href="#cb6-156" aria-hidden="true" tabindex="-1"></a>Please see <span class="co">[</span><span class="ot">docs</span><span class="co">](custom_integrations.qmd#liger)</span> for more info.</span>
+<span id="cb6-156"><a href="#cb6-156" aria-hidden="true" tabindex="-1"></a>single sequence causes OOM errors during model training.</span>
 <span id="cb6-157"><a href="#cb6-157" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-158"><a href="#cb6-158" aria-hidden="true" tabindex="-1"></a><span class="fu">## Troubleshooting {#sec-troubleshooting}</span></span>
+<span id="cb6-158"><a href="#cb6-158" aria-hidden="true" tabindex="-1"></a>See our <span class="co">[</span><span class="ot">dedicated guide</span><span class="co">](sequence_parallelism.qmd)</span> for more information.</span>
 <span id="cb6-159"><a href="#cb6-159" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-160"><a href="#cb6-160" aria-hidden="true" tabindex="-1"></a><span class="fu">### NCCL Issues {#sec-nccl}</span></span>
+<span id="cb6-160"><a href="#cb6-160" aria-hidden="true" tabindex="-1"></a><span class="fu">## Performance Optimization {#sec-performance}</span></span>
 <span id="cb6-161"><a href="#cb6-161" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-162"><a href="#cb6-162" aria-hidden="true" tabindex="-1"></a>For NCCL-related problems, see our <span class="co">[</span><span class="ot">NCCL troubleshooting guide</span><span class="co">](nccl.qmd)</span>.</span>
+<span id="cb6-162"><a href="#cb6-162" aria-hidden="true" tabindex="-1"></a><span class="fu">### Liger Kernel Integration {#sec-liger}</span></span>
 <span id="cb6-163"><a href="#cb6-163" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-164"><a href="#cb6-164" aria-hidden="true" tabindex="-1"></a><span class="fu">### Common Problems {#sec-common-problems}</span></span>
+<span id="cb6-164"><a href="#cb6-164" aria-hidden="true" tabindex="-1"></a>Please see <span class="co">[</span><span class="ot">docs</span><span class="co">](custom_integrations.qmd#liger)</span> for more info.</span>
 <span id="cb6-165"><a href="#cb6-165" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-166"><a href="#cb6-166" aria-hidden="true" tabindex="-1"></a>::: {.panel-tabset}</span>
+<span id="cb6-166"><a href="#cb6-166" aria-hidden="true" tabindex="-1"></a><span class="fu">## Troubleshooting {#sec-troubleshooting}</span></span>
 <span id="cb6-167"><a href="#cb6-167" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-168"><a href="#cb6-168" aria-hidden="true" tabindex="-1"></a><span class="fu">## Memory Issues</span></span>
+<span id="cb6-168"><a href="#cb6-168" aria-hidden="true" tabindex="-1"></a><span class="fu">### NCCL Issues {#sec-nccl}</span></span>
 <span id="cb6-169"><a href="#cb6-169" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-170"><a href="#cb6-170" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Reduce <span class="in">`micro_batch_size`</span></span>
+<span id="cb6-170"><a href="#cb6-170" aria-hidden="true" tabindex="-1"></a>For NCCL-related problems, see our <span class="co">[</span><span class="ot">NCCL troubleshooting guide</span><span class="co">](nccl.qmd)</span>.</span>
-<span id="cb6-171"><a href="#cb6-171" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Reduce <span class="in">`eval_batch_size`</span></span>
+<span id="cb6-171"><a href="#cb6-171" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-172"><a href="#cb6-172" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Adjust <span class="in">`gradient_accumulation_steps`</span></span>
+<span id="cb6-172"><a href="#cb6-172" aria-hidden="true" tabindex="-1"></a><span class="fu">### Common Problems {#sec-common-problems}</span></span>
-<span id="cb6-173"><a href="#cb6-173" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Consider using a higher ZeRO stage</span>
+<span id="cb6-173"><a href="#cb6-173" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-174"><a href="#cb6-174" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-174"><a href="#cb6-174" aria-hidden="true" tabindex="-1"></a>::: {.panel-tabset}</span>
-<span id="cb6-175"><a href="#cb6-175" aria-hidden="true" tabindex="-1"></a><span class="fu">## Training Instability</span></span>
+<span id="cb6-175"><a href="#cb6-175" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-176"><a href="#cb6-176" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-176"><a href="#cb6-176" aria-hidden="true" tabindex="-1"></a><span class="fu">## Memory Issues</span></span>
-<span id="cb6-177"><a href="#cb6-177" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Start with DeepSpeed ZeRO-2</span>
+<span id="cb6-177"><a href="#cb6-177" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-178"><a href="#cb6-178" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Monitor loss values</span>
+<span id="cb6-178"><a href="#cb6-178" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Reduce <span class="in">`micro_batch_size`</span></span>
-<span id="cb6-179"><a href="#cb6-179" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Check learning rates</span>
+<span id="cb6-179"><a href="#cb6-179" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Reduce <span class="in">`eval_batch_size`</span></span>
-<span id="cb6-180"><a href="#cb6-180" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb6-180"><a href="#cb6-180" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Adjust <span class="in">`gradient_accumulation_steps`</span></span>
-<span id="cb6-181"><a href="#cb6-181" aria-hidden="true" tabindex="-1"></a>:::</span>
+<span id="cb6-181"><a href="#cb6-181" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Consider using a higher ZeRO stage</span>
 <span id="cb6-182"><a href="#cb6-182" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-183"><a href="#cb6-183" aria-hidden="true" tabindex="-1"></a>For more detailed troubleshooting, see our <span class="co">[</span><span class="ot">debugging guide</span><span class="co">](debugging.qmd)</span>.</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button" data-in-quarto-modal=""><i class="bi"></i></button></div>
+<span id="cb6-183"><a href="#cb6-183" aria-hidden="true" tabindex="-1"></a><span class="fu">## Training Instability</span></span>
 <span id="cb6-184"><a href="#cb6-184" aria-hidden="true" tabindex="-1"></a></span>
 <span id="cb6-185"><a href="#cb6-185" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Start with DeepSpeed ZeRO-2</span>
 <span id="cb6-186"><a href="#cb6-186" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Monitor loss values</span>
 <span id="cb6-187"><a href="#cb6-187" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Check learning rates</span>
 <span id="cb6-188"><a href="#cb6-188" aria-hidden="true" tabindex="-1"></a></span>
 <span id="cb6-189"><a href="#cb6-189" aria-hidden="true" tabindex="-1"></a>:::</span>
 <span id="cb6-190"><a href="#cb6-190" aria-hidden="true" tabindex="-1"></a></span>
 <span id="cb6-191"><a href="#cb6-191" aria-hidden="true" tabindex="-1"></a>For more detailed troubleshooting, see our <span class="co">[</span><span class="ot">debugging guide</span><span class="co">](debugging.qmd)</span>.</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button" data-in-quarto-modal=""><i class="bi"></i></button></div>
 </div></div></div></div></div>
 </div> <!-- /content -->
--- a/examples/colab-notebooks/colab-axolotl-example.html
+++ b/examples/colab-notebooks/colab-axolotl-example.html
@@ -567,7 +567,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <div class="code-copy-outer-scaffold"><div class="sourceCode cell-code" id="cb1"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="op">%%</span>capture</span>
 <span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="co"># This step can take ~5-10 minutes to install dependencies</span></span>
 <span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>pip install <span class="op">--</span>no<span class="op">-</span>build<span class="op">-</span>isolation axolotl[flash<span class="op">-</span>attn]<span class="op">&gt;=</span><span class="fl">0.9.1</span></span>
-<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>pip install <span class="st">"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec"</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>pip install <span class="st">"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@5eff953"</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 </div>
 <section id="demo-talk-like-a-pirate" class="level2">
 <h2 class="anchored" data-anchor-id="demo-talk-like-a-pirate">Demo: Talk Like a Pirate</h2>
--- a/index.html
+++ b/index.html
@@ -564,6 +564,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <section id="latest-updates" class="level2">
 <h2 class="anchored" data-anchor-id="latest-updates">🎉 Latest Updates</h2>
 <ul>
 <li>2025/11: Axolotl now includes support for <a href="https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/olmo3">Olmo3</a>.</li>
 <li>2025/10: New model support has been added in Axolotl for: <a href="https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/qwen3-next">Qwen3 Next</a>, <a href="https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen2_5-vl">Qwen2.5-vl, Qwen3-vl</a>, <a href="https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen3">Qwen3, Qwen3MoE</a>, <a href="https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/granite4">Granite 4</a>, <a href="https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/hunyuan">HunYuan</a>, <a href="https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral#vision">Magistral 2509</a>, <a href="https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/apertus">Apertus</a>, and <a href="https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/seed-oss">Seed-OSS</a>.</li>
 <li>2025/09: Axolotl now has text diffusion training. Read more <a href="https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations/diffusion">here</a>.</li>
 <li>2025/08: QAT has been updated to include NVFP4 support. See <a href="https://github.com/axolotl-ai-cloud/axolotl/pull/3107">PR</a>.</li>
--- a/search.json
+++ b/search.json
@@ -501,8 +501,8 @@
    "objectID": "docs/multi-gpu.html#sec-overview",
    "href": "docs/multi-gpu.html#sec-overview",
    "title": "Multi-GPU",
-    "section": "1 Overview",
+    "section": "Overview",
-    "text": "1 Overview\nAxolotl supports several methods for multi-GPU training:\n\nDeepSpeed (recommended)\nFSDP (Fully Sharded Data Parallel)\nSequence parallelism\nFSDP + QLoRA",
+    "text": "Overview\nWhen training on multiple GPUs, Axolotl supports 3 sharding/parallelism strategies. Additionally, you can layer specific optimization features on top of that strategy.\nYou generally cannot combine these strategies; they are mutually exclusive.\n\nDeepSpeed: Powerful optimization library, supports ZeRO stages 1-3.\nFSDP (Fully Sharded Data Parallel): PyTorch’s native sharding implementation (Recommended).\nDDP (Distributed Data Parallel): PyTorch’s native parallelism implementation (Default if neither of the above are selected).\n\nThese features can often be combined with the strategies above:\n\nSequence Parallelism: Splits long sequences across GPUs (Compatible with DDP, DeepSpeed, and FSDP).\nFSDP + QLoRA: Combines 4-bit quantization with FSDP (Specific to FSDP).",
    "crumbs": [
      "Deployments",
      "Multi-GPU"
@@ -512,8 +512,8 @@
    "objectID": "docs/multi-gpu.html#sec-deepspeed",
    "href": "docs/multi-gpu.html#sec-deepspeed",
    "title": "Multi-GPU",
-    "section": "2 DeepSpeed",
+    "section": "DeepSpeed",
-    "text": "2 DeepSpeed\n\n2.1 Configuration\nAdd to your YAML config:\ndeepspeed: deepspeed_configs/zero1.json\n\n\n2.2 Usage\n# Fetch deepspeed configs (if not already present)\naxolotl fetch deepspeed_configs\n\n# Passing arg via config\naxolotl train config.yml\n\n# Passing arg via cli\naxolotl train config.yml --deepspeed deepspeed_configs/zero1.json\n\n\n2.3 ZeRO Stages\nWe provide default configurations for:\n\nZeRO Stage 1 (zero1.json)\nZeRO Stage 1 with torch compile (zero1_torch_compile.json)\nZeRO Stage 2 (zero2.json)\nZeRO Stage 3 (zero3.json)\nZeRO Stage 3 with bf16 (zero3_bf16.json)\nZeRO Stage 3 with bf16 and CPU offload params(zero3_bf16_cpuoffload_params.json)\nZeRO Stage 3 with bf16 and CPU offload params and optimizer (zero3_bf16_cpuoffload_all.json)\n\n\n\n\n\n\n\nTip\n\n\n\nChoose the configuration that offloads the least amount to memory while still being able to fit on VRAM for best performance.\nStart from Stage 1 -&gt; Stage 2 -&gt; Stage 3.",
+    "text": "DeepSpeed\n\nConfiguration\nAdd to your YAML config:\ndeepspeed: deepspeed_configs/zero1.json\n\n\nUsage\n# Fetch deepspeed configs (if not already present)\naxolotl fetch deepspeed_configs\n\n# Passing arg via config\naxolotl train config.yml\n\n# Passing arg via cli\naxolotl train config.yml --deepspeed deepspeed_configs/zero1.json\n\n\nZeRO Stages\nWe provide default configurations for:\n\nZeRO Stage 1 (zero1.json)\nZeRO Stage 1 with torch compile (zero1_torch_compile.json)\nZeRO Stage 2 (zero2.json)\nZeRO Stage 3 (zero3.json)\nZeRO Stage 3 with bf16 (zero3_bf16.json)\nZeRO Stage 3 with bf16 and CPU offload params(zero3_bf16_cpuoffload_params.json)\nZeRO Stage 3 with bf16 and CPU offload params and optimizer (zero3_bf16_cpuoffload_all.json)\n\n\n\n\n\n\n\nTip\n\n\n\nChoose the configuration that offloads the least amount to memory while still being able to fit on VRAM for best performance.\nStart from Stage 1 -&gt; Stage 2 -&gt; Stage 3.",
    "crumbs": [
      "Deployments",
      "Multi-GPU"
@@ -523,8 +523,8 @@
    "objectID": "docs/multi-gpu.html#sec-fsdp",
    "href": "docs/multi-gpu.html#sec-fsdp",
    "title": "Multi-GPU",
-    "section": "3 Fully Sharded Data Parallel (FSDP)",
+    "section": "Fully Sharded Data Parallel (FSDP)",
-    "text": "3 Fully Sharded Data Parallel (FSDP)\n\n\n\n\n\n\nNote\n\n\n\nFSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.\n\n\n\n3.1 Migrating from FSDP1 to FSDP2\nTo migrate your config from FSDP1 to FSDP2, you must use the fsdp_version top-level config field to specify the FSDP version, and\nalso follow the config field mapping below to update field names.\n\n3.1.1 Config mapping\n\n\n\nFSDP1\nFSDP2\n\n\n\n\nfsdp_sharding_strategy\nreshard_after_forward\n\n\nfsdp_backward_prefetch_policy\nREMOVED\n\n\nfsdp_backward_prefetch\nREMOVED\n\n\nfsdp_forward_prefetch\nREMOVED\n\n\nfsdp_sync_module_states\nREMOVED\n\n\nfsdp_cpu_ram_efficient_loading\ncpu_ram_efficient_loading\n\n\nfsdp_state_dict_type\nstate_dict_type\n\n\nfsdp_use_orig_params\nREMOVED\n\n\nfsdp_activation_checkpointing\nactivation_checkpointing\n\n\n\nFor more details, please see the migration guide in the torchtitan repo. In Axolotl,\nif you were using the following FSDP1 config:\nfsdp_version: 1\nfsdp_config:\n  fsdp_offload_params: false\n  fsdp_cpu_ram_efficient_loading: true\n  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP\n  fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer\n  fsdp_state_dict_type: FULL_STATE_DICT\n  fsdp_sharding_strategy: FULL_SHARD\nYou can migrate to the following FSDP2 config:\nfsdp_version: 2\nfsdp_config:\n  offload_params: false\n  cpu_ram_efficient_loading: true\n  auto_wrap_policy: TRANSFORMER_BASED_WRAP\n  transformer_layer_cls_to_wrap: Qwen3DecoderLayer\n  state_dict_type: FULL_STATE_DICT\n  reshard_after_forward: true\n\n\n\n3.2 FSDP1 (deprecated)\n\n\n\n\n\n\nNote\n\n\n\nUsing fsdp to configure FSDP is deprecated and will be removed in an upcoming release of Axolotl. Please use fsdp_config as above instead.\n\n\nfsdp:\n  - full_shard\n  - auto_wrap\nfsdp_config:\n  fsdp_offload_params: true\n  fsdp_state_dict_type: FULL_STATE_DICT\n  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer",
+    "text": "Fully Sharded Data Parallel (FSDP)\nFSDP allows you to shard model parameters, gradients, and optimizer states across data parallel workers.\n\n\n\n\n\n\nNote\n\n\n\nFSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.\n\n\n\nFSDP + QLoRA\nFor combining FSDP with QLoRA, see our dedicated guide.\n\n\nMigrating from FSDP1 to FSDP2\nTo migrate your config from FSDP1 to FSDP2, you must use the fsdp_version top-level config field to specify the FSDP version, and\nalso follow the config field mapping below to update field names.\n\nConfig mapping\n\n\n\nFSDP1\nFSDP2\n\n\n\n\nfsdp_sharding_strategy\nreshard_after_forward\n\n\nfsdp_backward_prefetch_policy\nREMOVED\n\n\nfsdp_backward_prefetch\nREMOVED\n\n\nfsdp_forward_prefetch\nREMOVED\n\n\nfsdp_sync_module_states\nREMOVED\n\n\nfsdp_cpu_ram_efficient_loading\ncpu_ram_efficient_loading\n\n\nfsdp_state_dict_type\nstate_dict_type\n\n\nfsdp_use_orig_params\nREMOVED\n\n\nfsdp_activation_checkpointing\nactivation_checkpointing\n\n\n\nFor more details, please see the migration guide in the torchtitan repo. In Axolotl,\nif you were using the following FSDP1 config:\nfsdp_version: 1\nfsdp_config:\n  fsdp_offload_params: false\n  fsdp_cpu_ram_efficient_loading: true\n  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP\n  fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer\n  fsdp_state_dict_type: FULL_STATE_DICT\n  fsdp_sharding_strategy: FULL_SHARD\nYou can migrate to the following FSDP2 config:\nfsdp_version: 2\nfsdp_config:\n  offload_params: false\n  cpu_ram_efficient_loading: true\n  auto_wrap_policy: TRANSFORMER_BASED_WRAP\n  transformer_layer_cls_to_wrap: Qwen3DecoderLayer\n  state_dict_type: FULL_STATE_DICT\n  reshard_after_forward: true\n\n\n\nFSDP1 (deprecated)\n\n\n\n\n\n\nNote\n\n\n\nUsing fsdp to configure FSDP is deprecated and will be removed in an upcoming release of Axolotl. Please use fsdp_config as above instead.\n\n\nfsdp:\n  - full_shard\n  - auto_wrap\nfsdp_config:\n  fsdp_offload_params: true\n  fsdp_state_dict_type: FULL_STATE_DICT\n  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer",
    "crumbs": [
      "Deployments",
      "Multi-GPU"
@@ -534,8 +534,8 @@
    "objectID": "docs/multi-gpu.html#sec-sequence-parallelism",
    "href": "docs/multi-gpu.html#sec-sequence-parallelism",
    "title": "Multi-GPU",
-    "section": "4 Sequence parallelism",
+    "section": "Sequence parallelism",
-    "text": "4 Sequence parallelism\nWe support sequence parallelism (SP) via the\nring-flash-attention project. This\nallows one to split up sequences across GPUs, which is useful in the event that a\nsingle sequence causes OOM errors during model training.\nSee our dedicated guide for more information.\n\n4.1 FSDP + QLoRA\nFor combining FSDP with QLoRA, see our dedicated guide.",
+    "text": "Sequence parallelism\nWe support sequence parallelism (SP) via the\nring-flash-attention project. This\nallows one to split up sequences across GPUs, which is useful in the event that a\nsingle sequence causes OOM errors during model training.\nSee our dedicated guide for more information.",
    "crumbs": [
      "Deployments",
      "Multi-GPU"
@@ -545,8 +545,8 @@
    "objectID": "docs/multi-gpu.html#sec-performance",
    "href": "docs/multi-gpu.html#sec-performance",
    "title": "Multi-GPU",
-    "section": "5 Performance Optimization",
+    "section": "Performance Optimization",
-    "text": "5 Performance Optimization\n\n5.1 Liger Kernel Integration\nPlease see docs for more info.",
+    "text": "Performance Optimization\n\nLiger Kernel Integration\nPlease see docs for more info.",
    "crumbs": [
      "Deployments",
      "Multi-GPU"
@@ -556,8 +556,8 @@
    "objectID": "docs/multi-gpu.html#sec-troubleshooting",
    "href": "docs/multi-gpu.html#sec-troubleshooting",
    "title": "Multi-GPU",
-    "section": "6 Troubleshooting",
+    "section": "Troubleshooting",
-    "text": "6 Troubleshooting\n\n6.1 NCCL Issues\nFor NCCL-related problems, see our NCCL troubleshooting guide.\n\n\n6.2 Common Problems\n\nMemory IssuesTraining Instability\n\n\n\nReduce micro_batch_size\nReduce eval_batch_size\nAdjust gradient_accumulation_steps\nConsider using a higher ZeRO stage\n\n\n\n\nStart with DeepSpeed ZeRO-2\nMonitor loss values\nCheck learning rates\n\n\n\n\nFor more detailed troubleshooting, see our debugging guide.",
+    "text": "Troubleshooting\n\nNCCL Issues\nFor NCCL-related problems, see our NCCL troubleshooting guide.\n\n\nCommon Problems\n\nMemory IssuesTraining Instability\n\n\n\nReduce micro_batch_size\nReduce eval_batch_size\nAdjust gradient_accumulation_steps\nConsider using a higher ZeRO stage\n\n\n\n\nStart with DeepSpeed ZeRO-2\nMonitor loss values\nCheck learning rates\n\n\n\n\nFor more detailed troubleshooting, see our debugging guide.",
    "crumbs": [
      "Deployments",
      "Multi-GPU"
@@ -1910,7 +1910,7 @@
    "href": "docs/custom_integrations.html#cut-cross-entropy",
    "title": "Custom Integrations",
    "section": "Cut Cross Entropy",
-    "text": "Cut Cross Entropy\nCut Cross Entropy (CCE) reduces VRAM usage through optimization on the cross-entropy operation during loss calculation.\nSee https://github.com/apple/ml-cross-entropy\n\nRequirements\n\nPyTorch 2.4.0 or higher\n\n\n\nInstallation\nRun the following command to install cut_cross_entropy[transformers] if you don’t have it already.\n\nIf you are in dev environment\n\npython scripts/cutcrossentropy_install.py | sh\n\nIf you are installing from pip\n\npip3 uninstall -y cut-cross-entropy && pip3 install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec\"\n\n\nUsage\nplugins:\n  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n\n\nSupported Models\n\napertus\narcee\ncohere\ncohere2\ndeepseek_v3\ngemma\ngemma2\ngemma3\ngemma3_text\ngemma3n\ngemma3n_text\nglm\nglm4\nglm4_moe\nglm4v\nglm4v_moe\ngpt_oss\ngranite\ngranitemoe\ngranitemoeshared\ngranitemoehybrid\nhunyuan_v1_dense\nhunyuan_v1_moe\nlfm2\nlfm2_moe\nlfm2_vl\nllama\nllama4\nllama4_text\nllava\nmistral\nmistral3\nmixtral\nmllama\nphi\nphi3\nphi4_multimodal\nqwen2\nqwen2_vl\nqwen2_moe\nqwen2_5_vl\nqwen3\nqwen3_moe\nqwen3_vl\nqwen3_vl_moe\nqwen3_next\nsmollm3\nseed_oss\nvoxtral\n\n\n\nCitation\n@article{wijmans2024cut,\n  author       = {Erik Wijmans and\n                  Brody Huval and\n                  Alexander Hertzberg and\n                  Vladlen Koltun and\n                  Philipp Kr\\\"ahenb\\\"uhl},\n  title        = {Cut Your Losses in Large-Vocabulary Language Models},\n  journal      = {arXiv},\n  year         = {2024},\n  url          = {https://arxiv.org/abs/2411.09009},\n}\nPlease see reference here",
+    "text": "Cut Cross Entropy\nCut Cross Entropy (CCE) reduces VRAM usage through optimization on the cross-entropy operation during loss calculation.\nSee https://github.com/apple/ml-cross-entropy\n\nRequirements\n\nPyTorch 2.4.0 or higher\n\n\n\nInstallation\nRun the following command to install cut_cross_entropy[transformers] if you don’t have it already.\n\nIf you are in dev environment\n\npython scripts/cutcrossentropy_install.py | sh\n\nIf you are installing from pip\n\npip3 uninstall -y cut-cross-entropy && pip3 install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@5eff953\"\n\n\nUsage\nplugins:\n  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n\n\nSupported Models\n\napertus\narcee\ncohere\ncohere2\ndeepseek_v3\ngemma\ngemma2\ngemma3\ngemma3_text\ngemma3n\ngemma3n_text\nglm\nglm4\nglm4_moe\nglm4v\nglm4v_moe\ngpt_oss\ngranite\ngranitemoe\ngranitemoeshared\ngranitemoehybrid\nhunyuan_v1_dense\nhunyuan_v1_moe\nlfm2\nlfm2_moe\nlfm2_vl\nllama\nllama4\nllama4_text\nllava\nmistral\nmistral3\nmixtral\nmllama\nolmo\nolmo2\nolmo3\nphi\nphi3\nphi4_multimodal\nqwen2\nqwen2_vl\nqwen2_moe\nqwen2_5_vl\nqwen3\nqwen3_moe\nqwen3_vl\nqwen3_vl_moe\nqwen3_next\nsmollm3\nseed_oss\nvoxtral\n\n\n\nCitation\n@article{wijmans2024cut,\n  author       = {Erik Wijmans and\n                  Brody Huval and\n                  Alexander Hertzberg and\n                  Vladlen Koltun and\n                  Philipp Kr\\\"ahenb\\\"uhl},\n  title        = {Cut Your Losses in Large-Vocabulary Language Models},\n  journal      = {arXiv},\n  year         = {2024},\n  url          = {https://arxiv.org/abs/2411.09009},\n}\nPlease see reference here",
    "crumbs": [
      "Advanced Features",
      "Custom Integrations"
@@ -2030,7 +2030,7 @@
    "href": "index.html#latest-updates",
    "title": "Axolotl",
    "section": "🎉 Latest Updates",
-    "text": "🎉 Latest Updates\n\n2025/10: New model support has been added in Axolotl for: Qwen3 Next, Qwen2.5-vl, Qwen3-vl, Qwen3, Qwen3MoE, Granite 4, HunYuan, Magistral 2509, Apertus, and Seed-OSS.\n2025/09: Axolotl now has text diffusion training. Read more here.\n2025/08: QAT has been updated to include NVFP4 support. See PR.\n2025/07:\n\nND Parallelism support has been added into Axolotl. Compose Context Parallelism (CP), Tensor Parallelism (TP), and Fully Sharded Data Parallelism (FSDP) within a single node and across multiple nodes. Check out the blog post for more info.\nAxolotl adds more models: GPT-OSS, Gemma 3n, Liquid Foundation Model 2 (LFM2), and Arcee Foundation Models (AFM).\nFP8 finetuning with fp8 gather op is now possible in Axolotl via torchao. Get started here!\nVoxtral, Magistral 1.1, and Devstral with mistral-common tokenizer support has been integrated in Axolotl!\nTiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed and FSDP support has been added to support Arctic Long Sequence Training. (ALST). See examples for using ALST with Axolotl!\n\n2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the docs to learn more!\n\n\n\nExpand older updates\n\n\n2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the blog and docs to learn how to scale your context length when fine-tuning.\n2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See examples to start training your own Magistral models with Axolotl!\n2025/04: Llama 4 support has been added in Axolotl. See examples to start training your own Llama 4 models with Axolotl’s linearized version!\n2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the docs to fine-tune your own!\n2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the docs to give it a try.\n2025/02: Axolotl has added GRPO support. Dive into our blog and GRPO example and have some fun!\n2025/01: Axolotl has added Reward Modelling / Process Reward Modelling fine-tuning support. See docs.",
+    "text": "🎉 Latest Updates\n\n2025/11: Axolotl now includes support for Olmo3.\n2025/10: New model support has been added in Axolotl for: Qwen3 Next, Qwen2.5-vl, Qwen3-vl, Qwen3, Qwen3MoE, Granite 4, HunYuan, Magistral 2509, Apertus, and Seed-OSS.\n2025/09: Axolotl now has text diffusion training. Read more here.\n2025/08: QAT has been updated to include NVFP4 support. See PR.\n2025/07:\n\nND Parallelism support has been added into Axolotl. Compose Context Parallelism (CP), Tensor Parallelism (TP), and Fully Sharded Data Parallelism (FSDP) within a single node and across multiple nodes. Check out the blog post for more info.\nAxolotl adds more models: GPT-OSS, Gemma 3n, Liquid Foundation Model 2 (LFM2), and Arcee Foundation Models (AFM).\nFP8 finetuning with fp8 gather op is now possible in Axolotl via torchao. Get started here!\nVoxtral, Magistral 1.1, and Devstral with mistral-common tokenizer support has been integrated in Axolotl!\nTiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed and FSDP support has been added to support Arctic Long Sequence Training. (ALST). See examples for using ALST with Axolotl!\n\n2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the docs to learn more!\n\n\n\nExpand older updates\n\n\n2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the blog and docs to learn how to scale your context length when fine-tuning.\n2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See examples to start training your own Magistral models with Axolotl!\n2025/04: Llama 4 support has been added in Axolotl. See examples to start training your own Llama 4 models with Axolotl’s linearized version!\n2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the docs to fine-tune your own!\n2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the docs to give it a try.\n2025/02: Axolotl has added GRPO support. Dive into our blog and GRPO example and have some fun!\n2025/01: Axolotl has added Reward Modelling / Process Reward Modelling fine-tuning support. See docs.",
    "crumbs": [
      "Home"
    ]
--- a/sitemap.xml
+++ b/sitemap.xml