Built site for gh-pages
This commit is contained in:
@@ -7,7 +7,7 @@
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
|
||||
|
||||
|
||||
<title>Multi-GPU Training Guide – Axolotl</title>
|
||||
<title>Multi-GPU – Axolotl</title>
|
||||
<style>
|
||||
code{white-space: pre-wrap;}
|
||||
span.smallcaps{font-variant: small-caps;}
|
||||
@@ -69,10 +69,10 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
|
||||
<script src="../site_libs/quarto-html/tippy.umd.min.js"></script>
|
||||
<script src="../site_libs/quarto-html/anchor.min.js"></script>
|
||||
<link href="../site_libs/quarto-html/tippy.css" rel="stylesheet">
|
||||
<link href="../site_libs/quarto-html/quarto-syntax-highlighting-2f5df379a58b258e96c21c0638c20c03.css" rel="stylesheet" id="quarto-text-highlighting-styles">
|
||||
<link href="../site_libs/quarto-html/quarto-syntax-highlighting-dark-b53751a350365c71b6c909e95f209ed1.css" rel="stylesheet" id="quarto-text-highlighting-styles">
|
||||
<script src="../site_libs/bootstrap/bootstrap.min.js"></script>
|
||||
<link href="../site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
|
||||
<link href="../site_libs/bootstrap/bootstrap-141b2cdb37a94fcfd6825c1581ff795f.min.css" rel="stylesheet" append-hash="true" id="quarto-bootstrap" data-mode="light">
|
||||
<link href="../site_libs/bootstrap/bootstrap-0cda210ced8960466d2ee7bf22d15016.min.css" rel="stylesheet" append-hash="true" id="quarto-bootstrap" data-mode="dark">
|
||||
<script id="quarto-search-options" type="application/json">{
|
||||
"location": "navbar",
|
||||
"copy-button": false,
|
||||
@@ -112,8 +112,8 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
|
||||
<nav class="navbar navbar-expand " data-bs-theme="dark">
|
||||
<div class="navbar-container container-fluid">
|
||||
<div class="navbar-brand-container mx-auto">
|
||||
<a class="navbar-brand" href="../index.html">
|
||||
<span class="navbar-title">Axolotl</span>
|
||||
<a href="../index.html" class="navbar-brand navbar-brand-logo">
|
||||
<img src="../image/axolotl_logo_digital_white.svg" alt="Axolotl" class="navbar-logo">
|
||||
</a>
|
||||
</div>
|
||||
<div class="quarto-navbar-tools tools-wide tools-end">
|
||||
@@ -129,7 +129,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
|
||||
<button type="button" class="quarto-btn-toggle btn" data-bs-toggle="collapse" role="button" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
|
||||
<i class="bi bi-layout-text-sidebar-reverse"></i>
|
||||
</button>
|
||||
<nav class="quarto-page-breadcrumbs" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="../docs/getting-started.html">How-To Guides</a></li><li class="breadcrumb-item"><a href="../docs/multi-gpu.html">Multi-GPU Training Guide</a></li></ol></nav>
|
||||
<nav class="quarto-page-breadcrumbs" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="../docs/multi-gpu.html">Deployments</a></li><li class="breadcrumb-item"><a href="../docs/multi-gpu.html">Multi-GPU</a></li></ol></nav>
|
||||
<a class="flex-grow-1" role="navigation" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
|
||||
</a>
|
||||
</div>
|
||||
@@ -150,7 +150,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
|
||||
<li class="sidebar-item sidebar-item-section">
|
||||
<div class="sidebar-item-container">
|
||||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true">
|
||||
<span class="menu-text">How-To Guides</span></a>
|
||||
<span class="menu-text">Getting Started</span></a>
|
||||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||||
<i class="bi bi-chevron-right ms-2"></i>
|
||||
</a>
|
||||
@@ -159,91 +159,25 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/getting-started.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Getting Started with Axolotl</span></a>
|
||||
<span class="menu-text">Quickstart</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/installation.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Installation Guide</span></a>
|
||||
<span class="menu-text">Installation</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/debugging.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Debugging</span></a>
|
||||
<a href="../docs/cli.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">CLI Reference</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/inference.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Inference Guide</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/multipack.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Multipack (Sample Packing)</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">FSDP + QLoRA</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/input_output.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Template-free prompt construction</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/rlhf.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">RLHF (Beta)</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/nccl.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">NCCL</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/mac.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Mac M-series</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/multi-gpu.html" class="sidebar-item-text sidebar-link active">
|
||||
<span class="menu-text">Multi-GPU Training Guide</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/multi-node.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Multi Node</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Unsloth</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Ray Train integration</span></a>
|
||||
<span class="menu-text">Inference</span></a>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
@@ -298,7 +232,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
|
||||
<li class="sidebar-item sidebar-item-section">
|
||||
<div class="sidebar-item-container">
|
||||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="true">
|
||||
<span class="menu-text">Reference</span></a>
|
||||
<span class="menu-text">Deployments</span></a>
|
||||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||||
<i class="bi bi-chevron-right ms-2"></i>
|
||||
</a>
|
||||
@@ -306,18 +240,187 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
|
||||
<ul id="quarto-sidebar-section-3" class="collapse list-unstyled sidebar-section depth1 show">
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/multi-gpu.html" class="sidebar-item-text sidebar-link active">
|
||||
<span class="menu-text">Multi-GPU</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/multi-node.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Multi Node</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Ray Train</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">AMD GPUs on HPC Systems</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/mac.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Mac M-series</span></a>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="sidebar-item sidebar-item-section">
|
||||
<div class="sidebar-item-container">
|
||||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-4" role="navigation" aria-expanded="true">
|
||||
<span class="menu-text">How To Guides</span></a>
|
||||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-4" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||||
<i class="bi bi-chevron-right ms-2"></i>
|
||||
</a>
|
||||
</div>
|
||||
<ul id="quarto-sidebar-section-4" class="collapse list-unstyled sidebar-section depth1 show">
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/multimodal.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">MultiModal / Vision Language Models (BETA)</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/rlhf.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">RLHF (Beta)</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/reward_modelling.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Reward Modelling</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/lr_groups.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Learning Rate Groups</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/lora_optims.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">LoRA Optimizations</span></a>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="sidebar-item sidebar-item-section">
|
||||
<div class="sidebar-item-container">
|
||||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-5" role="navigation" aria-expanded="true">
|
||||
<span class="menu-text">Core Concepts</span></a>
|
||||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-5" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||||
<i class="bi bi-chevron-right ms-2"></i>
|
||||
</a>
|
||||
</div>
|
||||
<ul id="quarto-sidebar-section-5" class="collapse list-unstyled sidebar-section depth1 show">
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/batch_vs_grad.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Batch size vs Gradient accumulation</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/dataset_preprocessing.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Dataset Preprocessing</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/multipack.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Multipack (Sample Packing)</span></a>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="sidebar-item sidebar-item-section">
|
||||
<div class="sidebar-item-container">
|
||||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-6" role="navigation" aria-expanded="true">
|
||||
<span class="menu-text">Advanced Features</span></a>
|
||||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-6" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||||
<i class="bi bi-chevron-right ms-2"></i>
|
||||
</a>
|
||||
</div>
|
||||
<ul id="quarto-sidebar-section-6" class="collapse list-unstyled sidebar-section depth1 show">
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">FSDP + QLoRA</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Unsloth</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/torchao.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">PyTorch ao</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/custom_integrations.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Custom Integrations</span></a>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="sidebar-item sidebar-item-section">
|
||||
<div class="sidebar-item-container">
|
||||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-7" role="navigation" aria-expanded="true">
|
||||
<span class="menu-text">Troubleshooting</span></a>
|
||||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-7" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||||
<i class="bi bi-chevron-right ms-2"></i>
|
||||
</a>
|
||||
</div>
|
||||
<ul id="quarto-sidebar-section-7" class="collapse list-unstyled sidebar-section depth1 show">
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/faq.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">FAQ</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/debugging.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Debugging</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/nccl.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">NCCL</span></a>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="sidebar-item sidebar-item-section">
|
||||
<div class="sidebar-item-container">
|
||||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-8" role="navigation" aria-expanded="true">
|
||||
<span class="menu-text">Reference</span></a>
|
||||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-8" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||||
<i class="bi bi-chevron-right ms-2"></i>
|
||||
</a>
|
||||
</div>
|
||||
<ul id="quarto-sidebar-section-8" class="collapse list-unstyled sidebar-section depth1 show">
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/config.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Config options</span></a>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/faq.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">FAQ</span></a>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav>
|
||||
@@ -355,9 +458,9 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
|
||||
<!-- main -->
|
||||
<main class="content" id="quarto-document-content">
|
||||
|
||||
<header id="title-block-header" class="quarto-title-block default"><nav class="quarto-page-breadcrumbs quarto-title-breadcrumbs d-none d-lg-block" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="../docs/getting-started.html">How-To Guides</a></li><li class="breadcrumb-item"><a href="../docs/multi-gpu.html">Multi-GPU Training Guide</a></li></ol></nav>
|
||||
<header id="title-block-header" class="quarto-title-block default"><nav class="quarto-page-breadcrumbs quarto-title-breadcrumbs d-none d-lg-block" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="../docs/multi-gpu.html">Deployments</a></li><li class="breadcrumb-item"><a href="../docs/multi-gpu.html">Multi-GPU</a></li></ol></nav>
|
||||
<div class="quarto-title">
|
||||
<div class="quarto-title-block"><div><h1 class="title">Multi-GPU Training Guide</h1><button type="button" class="btn code-tools-button" id="quarto-code-tools-source"><i class="bi"></i> Code</button></div></div>
|
||||
<div class="quarto-title-block"><div><h1 class="title">Multi-GPU</h1><button type="button" class="btn code-tools-button" id="quarto-code-tools-source"><i class="bi"></i> Code</button></div></div>
|
||||
</div>
|
||||
|
||||
|
||||
@@ -394,7 +497,11 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
|
||||
</section>
|
||||
<section id="sec-deepspeed-usage" class="level3" data-number="2.2">
|
||||
<h3 data-number="2.2" class="anchored" data-anchor-id="sec-deepspeed-usage"><span class="header-section-number">2.2</span> Usage</h3>
|
||||
<div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="ex">accelerate</span> launch <span class="at">-m</span> axolotl.cli.train examples/llama-2/config.yml <span class="at">--deepspeed</span> deepspeed_configs/zero1.json</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Passing arg via config</span></span>
|
||||
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> train config.yml</span>
|
||||
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Passing arg via cli</span></span>
|
||||
<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> train config.yml <span class="at">--deepspeed</span> deepspeed_configs/zero1.json</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
</section>
|
||||
<section id="sec-zero-stages" class="level3" data-number="2.3">
|
||||
<h3 data-number="2.3" class="anchored" data-anchor-id="sec-zero-stages"><span class="header-section-number">2.3</span> ZeRO Stages</h3>
|
||||
@@ -428,32 +535,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
|
||||
<h2 data-number="4" class="anchored" data-anchor-id="sec-performance"><span class="header-section-number">4</span> Performance Optimization</h2>
|
||||
<section id="sec-liger" class="level3" data-number="4.1">
|
||||
<h3 data-number="4.1" class="anchored" data-anchor-id="sec-liger"><span class="header-section-number">4.1</span> Liger Kernel Integration</h3>
|
||||
<div class="callout callout-style-default callout-note callout-titled">
|
||||
<div class="callout-header d-flex align-content-center">
|
||||
<div class="callout-icon-container">
|
||||
<i class="callout-icon"></i>
|
||||
</div>
|
||||
<div class="callout-title-container flex-fill">
|
||||
Note
|
||||
</div>
|
||||
</div>
|
||||
<div class="callout-body-container callout-body">
|
||||
<p>Liger Kernel provides efficient Triton kernels for LLM training, offering:</p>
|
||||
<ul>
|
||||
<li>20% increase in multi-GPU training throughput</li>
|
||||
<li>60% reduction in memory usage</li>
|
||||
<li>Compatibility with both FSDP and DeepSpeed</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
<p>Configuration:</p>
|
||||
<div class="sourceCode" id="cb4"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plugins</span><span class="kw">:</span></span>
|
||||
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> axolotl.integrations.liger.LigerPlugin</span></span>
|
||||
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="fu">liger_rope</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
|
||||
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a><span class="fu">liger_rms_norm</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
|
||||
<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a><span class="fu">liger_glu_activation</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
|
||||
<span id="cb4-6"><a href="#cb4-6" aria-hidden="true" tabindex="-1"></a><span class="fu">liger_layer_norm</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
|
||||
<span id="cb4-7"><a href="#cb4-7" aria-hidden="true" tabindex="-1"></a><span class="fu">liger_fused_linear_cross_entropy</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<p>Please see <a href="../docs/custom_integrations.html#liger">docs</a> for more info.</p>
|
||||
</section>
|
||||
</section>
|
||||
<section id="sec-troubleshooting" class="level2" data-number="5">
|
||||
@@ -961,124 +1043,110 @@ window.document.addEventListener("DOMContentLoaded", function (event) {
|
||||
}
|
||||
});
|
||||
</script><div class="modal fade" id="quarto-embedded-source-code-modal" tabindex="-1" aria-labelledby="quarto-embedded-source-code-modal-label" aria-hidden="true"><div class="modal-dialog modal-dialog-scrollable"><div class="modal-content"><div class="modal-header"><h5 class="modal-title" id="quarto-embedded-source-code-modal-label">Source Code</h5><button class="btn-close" data-bs-dismiss="modal"></button></div><div class="modal-body"><div class="">
|
||||
<div class="sourceCode" id="cb5" data-shortcodes="false"><pre class="sourceCode markdown code-with-copy"><code class="sourceCode markdown"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="co">---</span></span>
|
||||
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a><span class="an">title:</span><span class="co"> "Multi-GPU Training Guide"</span></span>
|
||||
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a><span class="an">format:</span></span>
|
||||
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a><span class="co"> html:</span></span>
|
||||
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a><span class="co"> toc: true</span></span>
|
||||
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a><span class="co"> toc-depth: 3</span></span>
|
||||
<span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a><span class="co"> number-sections: true</span></span>
|
||||
<span id="cb5-8"><a href="#cb5-8" aria-hidden="true" tabindex="-1"></a><span class="co"> code-tools: true</span></span>
|
||||
<span id="cb5-9"><a href="#cb5-9" aria-hidden="true" tabindex="-1"></a><span class="an">execute:</span></span>
|
||||
<span id="cb5-10"><a href="#cb5-10" aria-hidden="true" tabindex="-1"></a><span class="co"> enabled: false</span></span>
|
||||
<span id="cb5-11"><a href="#cb5-11" aria-hidden="true" tabindex="-1"></a><span class="co">---</span></span>
|
||||
<span id="cb5-12"><a href="#cb5-12" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-13"><a href="#cb5-13" aria-hidden="true" tabindex="-1"></a>This guide covers advanced training configurations for multi-GPU setups using Axolotl.</span>
|
||||
<span id="cb5-14"><a href="#cb5-14" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-15"><a href="#cb5-15" aria-hidden="true" tabindex="-1"></a><span class="fu">## Overview {#sec-overview}</span></span>
|
||||
<span id="cb5-16"><a href="#cb5-16" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-17"><a href="#cb5-17" aria-hidden="true" tabindex="-1"></a>Axolotl supports several methods for multi-GPU training:</span>
|
||||
<span id="cb5-18"><a href="#cb5-18" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-19"><a href="#cb5-19" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>DeepSpeed (recommended)</span>
|
||||
<span id="cb5-20"><a href="#cb5-20" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>FSDP (Fully Sharded Data Parallel)</span>
|
||||
<span id="cb5-21"><a href="#cb5-21" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>FSDP + QLoRA</span>
|
||||
<span id="cb5-22"><a href="#cb5-22" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-23"><a href="#cb5-23" aria-hidden="true" tabindex="-1"></a><span class="fu">## DeepSpeed {#sec-deepspeed}</span></span>
|
||||
<span id="cb5-24"><a href="#cb5-24" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-25"><a href="#cb5-25" aria-hidden="true" tabindex="-1"></a>DeepSpeed is the recommended approach for multi-GPU training due to its stability and performance. It provides various optimization levels through ZeRO stages.</span>
|
||||
<span id="cb5-26"><a href="#cb5-26" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-27"><a href="#cb5-27" aria-hidden="true" tabindex="-1"></a><span class="fu">### Configuration {#sec-deepspeed-config}</span></span>
|
||||
<span id="cb5-28"><a href="#cb5-28" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-29"><a href="#cb5-29" aria-hidden="true" tabindex="-1"></a>Add to your YAML config:</span>
|
||||
<span id="cb5-30"><a href="#cb5-30" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-31"><a href="#cb5-31" aria-hidden="true" tabindex="-1"></a><span class="in">```{.yaml}</span></span>
|
||||
<span id="cb5-32"><a href="#cb5-32" aria-hidden="true" tabindex="-1"></a><span class="in">deepspeed: deepspeed_configs/zero1.json</span></span>
|
||||
<span id="cb5-33"><a href="#cb5-33" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
|
||||
<span id="cb5-34"><a href="#cb5-34" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-35"><a href="#cb5-35" aria-hidden="true" tabindex="-1"></a><span class="fu">### Usage {#sec-deepspeed-usage}</span></span>
|
||||
<span id="cb5-36"><a href="#cb5-36" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-37"><a href="#cb5-37" aria-hidden="true" tabindex="-1"></a><span class="in">```{.bash}</span></span>
|
||||
<span id="cb5-38"><a href="#cb5-38" aria-hidden="true" tabindex="-1"></a><span class="in">accelerate launch -m axolotl.cli.train examples/llama-2/config.yml --deepspeed deepspeed_configs/zero1.json</span></span>
|
||||
<span id="cb5-39"><a href="#cb5-39" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
|
||||
<span id="cb5-40"><a href="#cb5-40" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-41"><a href="#cb5-41" aria-hidden="true" tabindex="-1"></a><span class="fu">### ZeRO Stages {#sec-zero-stages}</span></span>
|
||||
<span id="cb5-42"><a href="#cb5-42" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-43"><a href="#cb5-43" aria-hidden="true" tabindex="-1"></a>We provide default configurations for:</span>
|
||||
<span id="cb5-44"><a href="#cb5-44" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-45"><a href="#cb5-45" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 1 (<span class="in">`zero1.json`</span>)</span>
|
||||
<span id="cb5-46"><a href="#cb5-46" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 2 (<span class="in">`zero2.json`</span>)</span>
|
||||
<span id="cb5-47"><a href="#cb5-47" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 3 (<span class="in">`zero3.json`</span>)</span>
|
||||
<span id="cb5-48"><a href="#cb5-48" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-49"><a href="#cb5-49" aria-hidden="true" tabindex="-1"></a>Choose based on your memory requirements and performance needs.</span>
|
||||
<span id="cb5-50"><a href="#cb5-50" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-51"><a href="#cb5-51" aria-hidden="true" tabindex="-1"></a><span class="fu">## FSDP {#sec-fsdp}</span></span>
|
||||
<span id="cb5-52"><a href="#cb5-52" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-53"><a href="#cb5-53" aria-hidden="true" tabindex="-1"></a><span class="fu">### Basic FSDP Configuration {#sec-fsdp-config}</span></span>
|
||||
<span id="cb5-54"><a href="#cb5-54" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-55"><a href="#cb5-55" aria-hidden="true" tabindex="-1"></a><span class="in">```{.yaml}</span></span>
|
||||
<span id="cb5-56"><a href="#cb5-56" aria-hidden="true" tabindex="-1"></a><span class="in">fsdp:</span></span>
|
||||
<span id="cb5-57"><a href="#cb5-57" aria-hidden="true" tabindex="-1"></a><span class="in"> - full_shard</span></span>
|
||||
<span id="cb5-58"><a href="#cb5-58" aria-hidden="true" tabindex="-1"></a><span class="in"> - auto_wrap</span></span>
|
||||
<span id="cb5-59"><a href="#cb5-59" aria-hidden="true" tabindex="-1"></a><span class="in">fsdp_config:</span></span>
|
||||
<span id="cb5-60"><a href="#cb5-60" aria-hidden="true" tabindex="-1"></a><span class="in"> fsdp_offload_params: true</span></span>
|
||||
<span id="cb5-61"><a href="#cb5-61" aria-hidden="true" tabindex="-1"></a><span class="in"> fsdp_state_dict_type: FULL_STATE_DICT</span></span>
|
||||
<span id="cb5-62"><a href="#cb5-62" aria-hidden="true" tabindex="-1"></a><span class="in"> fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer</span></span>
|
||||
<span id="cb5-63"><a href="#cb5-63" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
|
||||
<span id="cb5-64"><a href="#cb5-64" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-65"><a href="#cb5-65" aria-hidden="true" tabindex="-1"></a><span class="fu">### FSDP + QLoRA {#sec-fsdp-qlora}</span></span>
|
||||
<span id="cb5-66"><a href="#cb5-66" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-67"><a href="#cb5-67" aria-hidden="true" tabindex="-1"></a>For combining FSDP with QLoRA, see our <span class="co">[</span><span class="ot">dedicated guide</span><span class="co">](fsdp_qlora.qmd)</span>.</span>
|
||||
<span id="cb5-68"><a href="#cb5-68" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-69"><a href="#cb5-69" aria-hidden="true" tabindex="-1"></a><span class="fu">## Performance Optimization {#sec-performance}</span></span>
|
||||
<span id="cb5-70"><a href="#cb5-70" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-71"><a href="#cb5-71" aria-hidden="true" tabindex="-1"></a><span class="fu">### Liger Kernel Integration {#sec-liger}</span></span>
|
||||
<span id="cb5-72"><a href="#cb5-72" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-73"><a href="#cb5-73" aria-hidden="true" tabindex="-1"></a>::: {.callout-note}</span>
|
||||
<span id="cb5-74"><a href="#cb5-74" aria-hidden="true" tabindex="-1"></a>Liger Kernel provides efficient Triton kernels for LLM training, offering:</span>
|
||||
<span id="cb5-75"><a href="#cb5-75" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-76"><a href="#cb5-76" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>20% increase in multi-GPU training throughput</span>
|
||||
<span id="cb5-77"><a href="#cb5-77" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>60% reduction in memory usage</span>
|
||||
<span id="cb5-78"><a href="#cb5-78" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Compatibility with both FSDP and DeepSpeed</span>
|
||||
<span id="cb5-79"><a href="#cb5-79" aria-hidden="true" tabindex="-1"></a>:::</span>
|
||||
<span id="cb5-80"><a href="#cb5-80" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-81"><a href="#cb5-81" aria-hidden="true" tabindex="-1"></a>Configuration:</span>
|
||||
<span id="cb5-82"><a href="#cb5-82" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-83"><a href="#cb5-83" aria-hidden="true" tabindex="-1"></a><span class="in">```{.yaml}</span></span>
|
||||
<span id="cb5-84"><a href="#cb5-84" aria-hidden="true" tabindex="-1"></a><span class="in">plugins:</span></span>
|
||||
<span id="cb5-85"><a href="#cb5-85" aria-hidden="true" tabindex="-1"></a><span class="in"> - axolotl.integrations.liger.LigerPlugin</span></span>
|
||||
<span id="cb5-86"><a href="#cb5-86" aria-hidden="true" tabindex="-1"></a><span class="in">liger_rope: true</span></span>
|
||||
<span id="cb5-87"><a href="#cb5-87" aria-hidden="true" tabindex="-1"></a><span class="in">liger_rms_norm: true</span></span>
|
||||
<span id="cb5-88"><a href="#cb5-88" aria-hidden="true" tabindex="-1"></a><span class="in">liger_glu_activation: true</span></span>
|
||||
<span id="cb5-89"><a href="#cb5-89" aria-hidden="true" tabindex="-1"></a><span class="in">liger_layer_norm: true</span></span>
|
||||
<span id="cb5-90"><a href="#cb5-90" aria-hidden="true" tabindex="-1"></a><span class="in">liger_fused_linear_cross_entropy: true</span></span>
|
||||
<span id="cb5-91"><a href="#cb5-91" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
|
||||
<span id="cb5-92"><a href="#cb5-92" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-93"><a href="#cb5-93" aria-hidden="true" tabindex="-1"></a><span class="fu">## Troubleshooting {#sec-troubleshooting}</span></span>
|
||||
<span id="cb5-94"><a href="#cb5-94" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-95"><a href="#cb5-95" aria-hidden="true" tabindex="-1"></a><span class="fu">### NCCL Issues {#sec-nccl}</span></span>
|
||||
<span id="cb5-96"><a href="#cb5-96" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-97"><a href="#cb5-97" aria-hidden="true" tabindex="-1"></a>For NCCL-related problems, see our <span class="co">[</span><span class="ot">NCCL troubleshooting guide</span><span class="co">](nccl.qmd)</span>.</span>
|
||||
<span id="cb5-98"><a href="#cb5-98" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-99"><a href="#cb5-99" aria-hidden="true" tabindex="-1"></a><span class="fu">### Common Problems {#sec-common-problems}</span></span>
|
||||
<span id="cb5-100"><a href="#cb5-100" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-101"><a href="#cb5-101" aria-hidden="true" tabindex="-1"></a>::: {.panel-tabset}</span>
|
||||
<span id="cb5-102"><a href="#cb5-102" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-103"><a href="#cb5-103" aria-hidden="true" tabindex="-1"></a><span class="fu">## Memory Issues</span></span>
|
||||
<span id="cb5-104"><a href="#cb5-104" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-105"><a href="#cb5-105" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Reduce <span class="in">`micro_batch_size`</span></span>
|
||||
<span id="cb5-106"><a href="#cb5-106" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Reduce <span class="in">`eval_batch_size`</span></span>
|
||||
<span id="cb5-107"><a href="#cb5-107" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Adjust <span class="in">`gradient_accumulation_steps`</span></span>
|
||||
<span id="cb5-108"><a href="#cb5-108" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Consider using a higher ZeRO stage</span>
|
||||
<span id="cb5-109"><a href="#cb5-109" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-110"><a href="#cb5-110" aria-hidden="true" tabindex="-1"></a><span class="fu">## Training Instability</span></span>
|
||||
<span id="cb5-111"><a href="#cb5-111" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-112"><a href="#cb5-112" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Start with DeepSpeed ZeRO-2</span>
|
||||
<span id="cb5-113"><a href="#cb5-113" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Monitor loss values</span>
|
||||
<span id="cb5-114"><a href="#cb5-114" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Check learning rates</span>
|
||||
<span id="cb5-115"><a href="#cb5-115" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-116"><a href="#cb5-116" aria-hidden="true" tabindex="-1"></a>:::</span>
|
||||
<span id="cb5-117"><a href="#cb5-117" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-118"><a href="#cb5-118" aria-hidden="true" tabindex="-1"></a>For more detailed troubleshooting, see our <span class="co">[</span><span class="ot">debugging guide</span><span class="co">](debugging.qmd)</span>.</span></code><button title="Copy to Clipboard" class="code-copy-button" data-in-quarto-modal=""><i class="bi"></i></button></pre></div>
|
||||
<div class="sourceCode" id="cb4" data-shortcodes="false"><pre class="sourceCode markdown code-with-copy"><code class="sourceCode markdown"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="co">---</span></span>
|
||||
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="an">title:</span><span class="co"> "Multi-GPU"</span></span>
|
||||
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="an">format:</span></span>
|
||||
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a><span class="co"> html:</span></span>
|
||||
<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a><span class="co"> toc: true</span></span>
|
||||
<span id="cb4-6"><a href="#cb4-6" aria-hidden="true" tabindex="-1"></a><span class="co"> toc-depth: 3</span></span>
|
||||
<span id="cb4-7"><a href="#cb4-7" aria-hidden="true" tabindex="-1"></a><span class="co"> number-sections: true</span></span>
|
||||
<span id="cb4-8"><a href="#cb4-8" aria-hidden="true" tabindex="-1"></a><span class="co"> code-tools: true</span></span>
|
||||
<span id="cb4-9"><a href="#cb4-9" aria-hidden="true" tabindex="-1"></a><span class="an">execute:</span></span>
|
||||
<span id="cb4-10"><a href="#cb4-10" aria-hidden="true" tabindex="-1"></a><span class="co"> enabled: false</span></span>
|
||||
<span id="cb4-11"><a href="#cb4-11" aria-hidden="true" tabindex="-1"></a><span class="co">---</span></span>
|
||||
<span id="cb4-12"><a href="#cb4-12" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-13"><a href="#cb4-13" aria-hidden="true" tabindex="-1"></a>This guide covers advanced training configurations for multi-GPU setups using Axolotl.</span>
|
||||
<span id="cb4-14"><a href="#cb4-14" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-15"><a href="#cb4-15" aria-hidden="true" tabindex="-1"></a><span class="fu">## Overview {#sec-overview}</span></span>
|
||||
<span id="cb4-16"><a href="#cb4-16" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-17"><a href="#cb4-17" aria-hidden="true" tabindex="-1"></a>Axolotl supports several methods for multi-GPU training:</span>
|
||||
<span id="cb4-18"><a href="#cb4-18" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-19"><a href="#cb4-19" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>DeepSpeed (recommended)</span>
|
||||
<span id="cb4-20"><a href="#cb4-20" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>FSDP (Fully Sharded Data Parallel)</span>
|
||||
<span id="cb4-21"><a href="#cb4-21" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>FSDP + QLoRA</span>
|
||||
<span id="cb4-22"><a href="#cb4-22" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-23"><a href="#cb4-23" aria-hidden="true" tabindex="-1"></a><span class="fu">## DeepSpeed {#sec-deepspeed}</span></span>
|
||||
<span id="cb4-24"><a href="#cb4-24" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-25"><a href="#cb4-25" aria-hidden="true" tabindex="-1"></a>DeepSpeed is the recommended approach for multi-GPU training due to its stability and performance. It provides various optimization levels through ZeRO stages.</span>
|
||||
<span id="cb4-26"><a href="#cb4-26" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-27"><a href="#cb4-27" aria-hidden="true" tabindex="-1"></a><span class="fu">### Configuration {#sec-deepspeed-config}</span></span>
|
||||
<span id="cb4-28"><a href="#cb4-28" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-29"><a href="#cb4-29" aria-hidden="true" tabindex="-1"></a>Add to your YAML config:</span>
|
||||
<span id="cb4-30"><a href="#cb4-30" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-31"><a href="#cb4-31" aria-hidden="true" tabindex="-1"></a><span class="in">```{.yaml}</span></span>
|
||||
<span id="cb4-32"><a href="#cb4-32" aria-hidden="true" tabindex="-1"></a><span class="in">deepspeed: deepspeed_configs/zero1.json</span></span>
|
||||
<span id="cb4-33"><a href="#cb4-33" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
|
||||
<span id="cb4-34"><a href="#cb4-34" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-35"><a href="#cb4-35" aria-hidden="true" tabindex="-1"></a><span class="fu">### Usage {#sec-deepspeed-usage}</span></span>
|
||||
<span id="cb4-36"><a href="#cb4-36" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-37"><a href="#cb4-37" aria-hidden="true" tabindex="-1"></a><span class="in">```{.bash}</span></span>
|
||||
<span id="cb4-38"><a href="#cb4-38" aria-hidden="true" tabindex="-1"></a><span class="in"># Passing arg via config</span></span>
|
||||
<span id="cb4-39"><a href="#cb4-39" aria-hidden="true" tabindex="-1"></a><span class="in">axolotl train config.yml</span></span>
|
||||
<span id="cb4-40"><a href="#cb4-40" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-41"><a href="#cb4-41" aria-hidden="true" tabindex="-1"></a><span class="in"># Passing arg via cli</span></span>
|
||||
<span id="cb4-42"><a href="#cb4-42" aria-hidden="true" tabindex="-1"></a><span class="in">axolotl train config.yml --deepspeed deepspeed_configs/zero1.json</span></span>
|
||||
<span id="cb4-43"><a href="#cb4-43" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
|
||||
<span id="cb4-44"><a href="#cb4-44" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-45"><a href="#cb4-45" aria-hidden="true" tabindex="-1"></a><span class="fu">### ZeRO Stages {#sec-zero-stages}</span></span>
|
||||
<span id="cb4-46"><a href="#cb4-46" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-47"><a href="#cb4-47" aria-hidden="true" tabindex="-1"></a>We provide default configurations for:</span>
|
||||
<span id="cb4-48"><a href="#cb4-48" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-49"><a href="#cb4-49" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 1 (<span class="in">`zero1.json`</span>)</span>
|
||||
<span id="cb4-50"><a href="#cb4-50" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 2 (<span class="in">`zero2.json`</span>)</span>
|
||||
<span id="cb4-51"><a href="#cb4-51" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>ZeRO Stage 3 (<span class="in">`zero3.json`</span>)</span>
|
||||
<span id="cb4-52"><a href="#cb4-52" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-53"><a href="#cb4-53" aria-hidden="true" tabindex="-1"></a>Choose based on your memory requirements and performance needs.</span>
|
||||
<span id="cb4-54"><a href="#cb4-54" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-55"><a href="#cb4-55" aria-hidden="true" tabindex="-1"></a><span class="fu">## FSDP {#sec-fsdp}</span></span>
|
||||
<span id="cb4-56"><a href="#cb4-56" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-57"><a href="#cb4-57" aria-hidden="true" tabindex="-1"></a><span class="fu">### Basic FSDP Configuration {#sec-fsdp-config}</span></span>
|
||||
<span id="cb4-58"><a href="#cb4-58" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-59"><a href="#cb4-59" aria-hidden="true" tabindex="-1"></a><span class="in">```{.yaml}</span></span>
|
||||
<span id="cb4-60"><a href="#cb4-60" aria-hidden="true" tabindex="-1"></a><span class="in">fsdp:</span></span>
|
||||
<span id="cb4-61"><a href="#cb4-61" aria-hidden="true" tabindex="-1"></a><span class="in"> - full_shard</span></span>
|
||||
<span id="cb4-62"><a href="#cb4-62" aria-hidden="true" tabindex="-1"></a><span class="in"> - auto_wrap</span></span>
|
||||
<span id="cb4-63"><a href="#cb4-63" aria-hidden="true" tabindex="-1"></a><span class="in">fsdp_config:</span></span>
|
||||
<span id="cb4-64"><a href="#cb4-64" aria-hidden="true" tabindex="-1"></a><span class="in"> fsdp_offload_params: true</span></span>
|
||||
<span id="cb4-65"><a href="#cb4-65" aria-hidden="true" tabindex="-1"></a><span class="in"> fsdp_state_dict_type: FULL_STATE_DICT</span></span>
|
||||
<span id="cb4-66"><a href="#cb4-66" aria-hidden="true" tabindex="-1"></a><span class="in"> fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer</span></span>
|
||||
<span id="cb4-67"><a href="#cb4-67" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
|
||||
<span id="cb4-68"><a href="#cb4-68" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-69"><a href="#cb4-69" aria-hidden="true" tabindex="-1"></a><span class="fu">### FSDP + QLoRA {#sec-fsdp-qlora}</span></span>
|
||||
<span id="cb4-70"><a href="#cb4-70" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-71"><a href="#cb4-71" aria-hidden="true" tabindex="-1"></a>For combining FSDP with QLoRA, see our <span class="co">[</span><span class="ot">dedicated guide</span><span class="co">](fsdp_qlora.qmd)</span>.</span>
|
||||
<span id="cb4-72"><a href="#cb4-72" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-73"><a href="#cb4-73" aria-hidden="true" tabindex="-1"></a><span class="fu">## Performance Optimization {#sec-performance}</span></span>
|
||||
<span id="cb4-74"><a href="#cb4-74" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-75"><a href="#cb4-75" aria-hidden="true" tabindex="-1"></a><span class="fu">### Liger Kernel Integration {#sec-liger}</span></span>
|
||||
<span id="cb4-76"><a href="#cb4-76" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-77"><a href="#cb4-77" aria-hidden="true" tabindex="-1"></a>Please see <span class="co">[</span><span class="ot">docs</span><span class="co">](custom_integrations.qmd#liger)</span> for more info.</span>
|
||||
<span id="cb4-78"><a href="#cb4-78" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-79"><a href="#cb4-79" aria-hidden="true" tabindex="-1"></a><span class="fu">## Troubleshooting {#sec-troubleshooting}</span></span>
|
||||
<span id="cb4-80"><a href="#cb4-80" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-81"><a href="#cb4-81" aria-hidden="true" tabindex="-1"></a><span class="fu">### NCCL Issues {#sec-nccl}</span></span>
|
||||
<span id="cb4-82"><a href="#cb4-82" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-83"><a href="#cb4-83" aria-hidden="true" tabindex="-1"></a>For NCCL-related problems, see our <span class="co">[</span><span class="ot">NCCL troubleshooting guide</span><span class="co">](nccl.qmd)</span>.</span>
|
||||
<span id="cb4-84"><a href="#cb4-84" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-85"><a href="#cb4-85" aria-hidden="true" tabindex="-1"></a><span class="fu">### Common Problems {#sec-common-problems}</span></span>
|
||||
<span id="cb4-86"><a href="#cb4-86" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-87"><a href="#cb4-87" aria-hidden="true" tabindex="-1"></a>::: {.panel-tabset}</span>
|
||||
<span id="cb4-88"><a href="#cb4-88" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-89"><a href="#cb4-89" aria-hidden="true" tabindex="-1"></a><span class="fu">## Memory Issues</span></span>
|
||||
<span id="cb4-90"><a href="#cb4-90" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-91"><a href="#cb4-91" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Reduce <span class="in">`micro_batch_size`</span></span>
|
||||
<span id="cb4-92"><a href="#cb4-92" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Reduce <span class="in">`eval_batch_size`</span></span>
|
||||
<span id="cb4-93"><a href="#cb4-93" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Adjust <span class="in">`gradient_accumulation_steps`</span></span>
|
||||
<span id="cb4-94"><a href="#cb4-94" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Consider using a higher ZeRO stage</span>
|
||||
<span id="cb4-95"><a href="#cb4-95" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-96"><a href="#cb4-96" aria-hidden="true" tabindex="-1"></a><span class="fu">## Training Instability</span></span>
|
||||
<span id="cb4-97"><a href="#cb4-97" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-98"><a href="#cb4-98" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Start with DeepSpeed ZeRO-2</span>
|
||||
<span id="cb4-99"><a href="#cb4-99" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Monitor loss values</span>
|
||||
<span id="cb4-100"><a href="#cb4-100" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Check learning rates</span>
|
||||
<span id="cb4-101"><a href="#cb4-101" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-102"><a href="#cb4-102" aria-hidden="true" tabindex="-1"></a>:::</span>
|
||||
<span id="cb4-103"><a href="#cb4-103" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-104"><a href="#cb4-104" aria-hidden="true" tabindex="-1"></a>For more detailed troubleshooting, see our <span class="co">[</span><span class="ot">debugging guide</span><span class="co">](debugging.qmd)</span>.</span></code><button title="Copy to Clipboard" class="code-copy-button" data-in-quarto-modal=""><i class="bi"></i></button></pre></div>
|
||||
</div></div></div></div></div>
|
||||
</div> <!-- /content -->
|
||||
|
||||
|
||||
Reference in New Issue
Block a user