<!DOCTYPE html>
|
||
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
|
||
|
||
<meta charset="utf-8">
|
||
<meta name="generator" content="quarto-1.8.26">
|
||
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
|
||
|
||
<meta name="description" content="How to use Axolotl on multiple machines">
|
||
|
||
<title>Multi Node – Axolotl</title>
|
||
<style>
|
||
code{white-space: pre-wrap;}
|
||
span.smallcaps{font-variant: small-caps;}
|
||
div.columns{display: flex; gap: min(4vw, 1.5em);}
|
||
div.column{flex: auto; overflow-x: auto;}
|
||
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
|
||
ul.task-list{list-style: none;}
|
||
ul.task-list li input[type="checkbox"] {
|
||
width: 0.8em;
|
||
margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
|
||
vertical-align: middle;
|
||
}
|
||
/* CSS for syntax highlighting */
|
||
html { -webkit-text-size-adjust: 100%; }
|
||
pre > code.sourceCode { white-space: pre; position: relative; }
|
||
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
|
||
pre > code.sourceCode > span:empty { height: 1.2em; }
|
||
.sourceCode { overflow: visible; }
|
||
code.sourceCode > span { color: inherit; text-decoration: inherit; }
|
||
div.sourceCode { margin: 1em 0; }
|
||
pre.sourceCode { margin: 0; }
|
||
@media screen {
|
||
div.sourceCode { overflow: auto; }
|
||
}
|
||
@media print {
|
||
pre > code.sourceCode { white-space: pre-wrap; }
|
||
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
|
||
}
|
||
pre.numberSource code
|
||
{ counter-reset: source-line 0; }
|
||
pre.numberSource code > span
|
||
{ position: relative; left: -4em; counter-increment: source-line; }
|
||
pre.numberSource code > span > a:first-child::before
|
||
{ content: counter(source-line);
|
||
position: relative; left: -1em; text-align: right; vertical-align: baseline;
|
||
border: none; display: inline-block;
|
||
-webkit-touch-callout: none; -webkit-user-select: none;
|
||
-khtml-user-select: none; -moz-user-select: none;
|
||
-ms-user-select: none; user-select: none;
|
||
padding: 0 4px; width: 4em;
|
||
}
|
||
pre.numberSource { margin-left: 3em; padding-left: 4px; }
|
||
div.sourceCode
|
||
{ }
|
||
@media screen {
|
||
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
|
||
}
|
||
</style>
|
||
|
||
|
||
<script src="../site_libs/quarto-nav/quarto-nav.js"></script>
|
||
<script src="../site_libs/clipboard/clipboard.min.js"></script>
|
||
<script src="../site_libs/quarto-search/autocomplete.umd.js"></script>
|
||
<script src="../site_libs/quarto-search/fuse.min.js"></script>
|
||
<script src="../site_libs/quarto-search/quarto-search.js"></script>
|
||
<meta name="quarto:offset" content="../">
|
||
<link href="../favicon.jpg" rel="icon" type="image/jpeg">
|
||
<script src="../site_libs/quarto-html/quarto.js" type="module"></script>
|
||
<script src="../site_libs/quarto-html/tabsets/tabsets.js" type="module"></script>
|
||
<script src="../site_libs/quarto-html/axe/axe-check.js" type="module"></script>
|
||
<script src="../site_libs/quarto-html/popper.min.js"></script>
|
||
<script src="../site_libs/quarto-html/tippy.umd.min.js"></script>
|
||
<script src="../site_libs/quarto-html/anchor.min.js"></script>
|
||
<link href="../site_libs/quarto-html/tippy.css" rel="stylesheet">
|
||
<link href="../site_libs/quarto-html/quarto-syntax-highlighting-dark-b758ccaa5987ceb1b75504551e579abf.css" rel="stylesheet" id="quarto-text-highlighting-styles">
|
||
<script src="../site_libs/bootstrap/bootstrap.min.js"></script>
|
||
<link href="../site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
|
||
<link href="../site_libs/bootstrap/bootstrap-08d9eb451d58809f35fda8b852d737d8.min.css" rel="stylesheet" append-hash="true" id="quarto-bootstrap" data-mode="dark">
|
||
<script id="quarto-search-options" type="application/json">{
|
||
"location": "navbar",
|
||
"copy-button": false,
|
||
"collapse-after": 3,
|
||
"panel-placement": "end",
|
||
"type": "overlay",
|
||
"limit": 50,
|
||
"keyboard-shortcut": [
|
||
"f",
|
||
"/",
|
||
"s"
|
||
],
|
||
"show-item-context": false,
|
||
"language": {
|
||
"search-no-results-text": "No results",
|
||
"search-matching-documents-text": "matching documents",
|
||
"search-copy-link-title": "Copy link to search",
|
||
"search-hide-matches-text": "Hide additional matches",
|
||
"search-more-match-text": "more match in this document",
|
||
"search-more-matches-text": "more matches in this document",
|
||
"search-clear-button-title": "Clear",
|
||
"search-text-placeholder": "",
|
||
"search-detached-cancel-button-title": "Cancel",
|
||
"search-submit-button-title": "Submit",
|
||
"search-label": "Search"
|
||
}
|
||
}</script>
|
||
<script async="" src="https://www.googletagmanager.com/gtag/js?id=G-9KYCVJBNMQ"></script>
|
||
|
||
<script type="text/javascript">
|
||
|
||
window.dataLayer = window.dataLayer || [];
|
||
// Queue one call: pushes this invocation's `arguments` object onto the global dataLayer array.
// NOTE(review): presumably the gtag.js loader expects the raw `arguments` object, not a copied array — confirm before refactoring to rest parameters.
function gtag(){dataLayer.push(arguments);}
|
||
gtag('js', new Date());
|
||
gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
|
||
</script>
|
||
|
||
|
||
<link rel="stylesheet" href="../styles.css">
|
||
</head>
|
||
|
||
<body class="nav-sidebar docked nav-fixed quarto-light">
|
||
|
||
<div id="quarto-search-results"></div>
|
||
<header id="quarto-header" class="headroom fixed-top">
|
||
<nav class="navbar navbar-expand " data-bs-theme="dark">
|
||
<div class="navbar-container container-fluid">
|
||
<div class="navbar-brand-container mx-auto">
|
||
<!-- The logo images inside this link are decorative (alt=""), so the link itself carries the accessible name. -->
<a href="../index.html" class="navbar-brand navbar-brand-logo" aria-label="Axolotl home">
|
||
<img src="../image/axolotl_logo_digital_white.svg" alt="" class="navbar-logo light-content">
|
||
<img src="../image/axolotl_logo_digital_white.svg" alt="" class="navbar-logo dark-content">
|
||
</a>
|
||
</div>
|
||
<div class="quarto-navbar-tools tools-wide tools-end">
|
||
<!-- Icon-only link: the name comes from aria-label; the icon glyph is hidden from assistive technology. -->
<a href="https://twitter.com/axolotl_ai" title="Twitter" class="quarto-navigation-tool px-1" aria-label="Twitter"><i class="bi bi-twitter" aria-hidden="true"></i></a>
|
||
<!-- Icon-only link: the name comes from aria-label; the icon glyph is hidden from assistive technology. -->
<a href="https://github.com/axolotl-ai-cloud/axolotl/" title="GitHub" class="quarto-navigation-tool px-1" aria-label="GitHub"><i class="bi bi-github" aria-hidden="true"></i></a>
|
||
<!-- Icon-only link: the name comes from aria-label; the icon glyph is hidden from assistive technology. -->
<a href="https://discord.gg/7m9sfhzaf3" title="Discord" class="quarto-navigation-tool px-1" aria-label="Discord"><i class="bi bi-discord" aria-hidden="true"></i></a>
|
||
</div>
|
||
<div id="quarto-search" class="" title="Search"></div>
|
||
</div> <!-- /container-fluid -->
|
||
</nav>
|
||
<nav class="quarto-secondary-nav">
|
||
<div class="container-fluid d-flex">
|
||
<!-- Sidebar toggle: a native button already exposes the button role, so no explicit role attribute is needed. -->
<button type="button" class="quarto-btn-toggle btn" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
|
||
<i class="bi bi-layout-text-sidebar-reverse"></i>
|
||
</button>
|
||
<!-- Breadcrumb trail; the final crumb is this page and is marked with aria-current="page". -->
<nav class="quarto-page-breadcrumbs" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="../docs/docker.html">Deployments</a></li><li class="breadcrumb-item"><a href="../docs/multi-node.html" aria-current="page">Multi Node</a></li></ol></nav>
|
||
<a class="flex-grow-1" role="navigation" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
|
||
</a>
|
||
</div>
|
||
</nav>
|
||
</header>
|
||
<!-- content -->
|
||
<div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article page-navbar">
|
||
<!-- sidebar -->
|
||
<!-- Documentation sidebar; labelled so it can be told apart from the other nav landmarks on the page. -->
<nav id="quarto-sidebar" class="sidebar collapse collapse-horizontal quarto-sidebar-collapse-item sidebar-navigation docked overflow-auto" aria-label="Documentation">
|
||
<div class="sidebar-menu-container">
|
||
<ul class="list-unstyled mt-1">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../index.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Home</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true">
|
||
<span class="menu-text">Getting Started</span></a>
|
||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-1" class="collapse list-unstyled sidebar-section depth1 show">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/getting-started.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Quickstart</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/installation.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Installation</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/inference.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Inference and Merging</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" role="navigation" aria-expanded="false">
|
||
<span class="menu-text">Model Guides</span></a>
|
||
<a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" role="navigation" aria-expanded="false" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-2" class="collapse list-unstyled sidebar-section depth2 ">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/kimi-linear.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Kimi Linear</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/plano.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Plano Orchestrator</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/mimo.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">MiMo</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/internvl3_5.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">InternVL 3.5</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/olmo3.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">OLMo 3</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/trinity.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Trinity</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/arcee.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Arcee AFM</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="false">
|
||
<span class="menu-text">Ministral3</span></a>
|
||
<a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="false" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-3" class="collapse list-unstyled sidebar-section depth3 ">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/ministral3.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Ministral3</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/ministral3/think.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Ministral 3 Thinking</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/ministral3/vision.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Ministral 3 Vision</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-4" role="navigation" aria-expanded="false">
|
||
<span class="menu-text">Magistral</span></a>
|
||
<a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-4" role="navigation" aria-expanded="false" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-4" class="collapse list-unstyled sidebar-section depth3 ">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/magistral.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Magistral</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/magistral/think.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Magistral Thinking</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/magistral/vision.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Magistral Vision</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/ministral.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Ministral</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/mistral-small.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Mistral Small 3.1/3.2</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/voxtral.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Voxtral</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/devstral.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Devstral</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/mistral.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Mistral 7B</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/llama-4.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Llama 4</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/llama-2.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Llama 2</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/qwen3-next.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Qwen 3 Next</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/qwen3.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Qwen 3</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/gemma3n.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Gemma 3n</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/apertus.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Apertus</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/gpt-oss.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">GPT-OSS</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/seed-oss.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Seed-OSS</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/phi.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Phi</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/smolvlm2.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">SmolVLM 2</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/granite4.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Granite 4</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/LiquidAI.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Liquid Foundation Models 2</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/hunyuan.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Hunyuan</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/jamba.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Jamba</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/models/orpheus.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Orpheus</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/cli.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Command Line Interface (CLI)</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/telemetry.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Telemetry</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/config-reference.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Config Reference</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/api" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">API Reference</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/dataset-formats/index.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Dataset Formats</span></a>
|
||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-5" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-5" class="collapse list-unstyled sidebar-section depth1 show">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/dataset-formats/pretraining.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Pre-training</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/dataset-formats/inst_tune.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Instruction Tuning</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/dataset-formats/conversation.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Conversation</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/dataset-formats/stepwise_supervised.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Stepwise Supervised Format</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/dataset-formats/template_free.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Template-Free</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/dataset-formats/tokenized.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Custom Pre-Tokenized Dataset</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-6" role="navigation" aria-expanded="true">
|
||
<span class="menu-text">Deployments</span></a>
|
||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-6" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-6" class="collapse list-unstyled sidebar-section depth1 show">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/docker.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Docker</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/multi-gpu.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Multi-GPU</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<!-- Current page entry: aria-current exposes to assistive technology what the "active" class shows visually. -->
<a href="../docs/multi-node.html" class="sidebar-item-text sidebar-link active" aria-current="page">
|
||
<span class="menu-text">Multi Node</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Ray Train</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">AMD GPUs on HPC Systems</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/mac.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Mac M-series</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-7" role="navigation" aria-expanded="true">
|
||
<span class="menu-text">How To Guides</span></a>
|
||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-7" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-7" class="collapse list-unstyled sidebar-section depth1 show">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/multimodal.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">MultiModal / Vision Language Models (BETA)</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/rlhf.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">RLHF (Beta)</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/reward_modelling.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Reward Modelling</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/lr_groups.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Learning Rate Groups</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/lora_optims.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">LoRA Optimizations</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/dataset_loading.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Dataset Loading</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/qat.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Quantization Aware Training (QAT)</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/quantize.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Quantization with torchao</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/optimizations.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Optimizations Guide</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-8" role="navigation" aria-expanded="true">
|
||
<span class="menu-text">Core Concepts</span></a>
|
||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-8" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-8" class="collapse list-unstyled sidebar-section depth1 show">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/batch_vs_grad.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Batch size vs Gradient accumulation</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/dataset_preprocessing.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Dataset Preprocessing</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/streaming.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Streaming Datasets</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/multipack.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Multipack (Sample Packing)</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/mixed_precision.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Mixed Precision Training</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/optimizers.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Optimizers</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-9" role="navigation" aria-expanded="true">
|
||
<span class="menu-text">Advanced Features</span></a>
|
||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-9" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-9" class="collapse list-unstyled sidebar-section depth1 show">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">FSDP + QLoRA</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Unsloth</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/torchao.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">PyTorch ao</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/custom_integrations.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Custom Integrations</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/sequence_parallelism.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Sequence Parallelism</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/gradient_checkpointing.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Gradient Checkpointing and Activation Offloading</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/nd_parallelism.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">N-D Parallelism (Beta)</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-10" role="navigation" aria-expanded="true">
|
||
<span class="menu-text">Troubleshooting</span></a>
|
||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-10" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-10" class="collapse list-unstyled sidebar-section depth1 show">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/faq.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">FAQ</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/debugging.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Debugging</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../docs/nccl.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">NCCL</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
</div>
|
||
</nav>
|
||
<div id="quarto-sidebar-glass" class="quarto-sidebar-collapse-item" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item"></div>
|
||
<!-- margin-sidebar -->
|
||
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
|
||
<nav id="TOC" role="doc-toc" class="toc-active">
|
||
<h2 id="toc-title">On this page</h2>
|
||
|
||
<ul>
|
||
<li><a href="#accelerate" id="toc-accelerate" class="nav-link active" data-scroll-target="#accelerate">Accelerate</a></li>
|
||
<li><a href="#raytrain" id="toc-raytrain" class="nav-link" data-scroll-target="#raytrain">Raytrain</a></li>
|
||
<li><a href="#torchrun" id="toc-torchrun" class="nav-link" data-scroll-target="#torchrun">Torchrun</a>
|
||
<ul class="collapse">
|
||
<li><a href="#option-1-new-axolotl-cli-with-launcher-args-recommended" id="toc-option-1-new-axolotl-cli-with-launcher-args-recommended" class="nav-link" data-scroll-target="#option-1-new-axolotl-cli-with-launcher-args-recommended">Option 1: New Axolotl CLI with launcher args (Recommended)</a></li>
|
||
<li><a href="#option-2-direct-torchrun-legacy" id="toc-option-2-direct-torchrun-legacy" class="nav-link" data-scroll-target="#option-2-direct-torchrun-legacy">Option 2: Direct torchrun (Legacy)</a></li>
|
||
</ul></li>
|
||
</ul>
|
||
</nav>
|
||
</div>
|
||
<!-- main -->
|
||
<main class="content" id="quarto-document-content">
|
||
|
||
<header id="title-block-header" class="quarto-title-block default"><nav class="quarto-page-breadcrumbs quarto-title-breadcrumbs d-none d-lg-block" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="../docs/docker.html">Deployments</a></li><li class="breadcrumb-item"><a href="../docs/multi-node.html">Multi Node</a></li></ol></nav>
|
||
<div class="quarto-title">
|
||
<h1 class="title">Multi Node</h1>
|
||
</div>
|
||
|
||
<div>
|
||
<div class="description">
|
||
How to use Axolotl on multiple machines
|
||
</div>
|
||
</div>
|
||
|
||
|
||
<div class="quarto-title-meta">
|
||
|
||
|
||
|
||
|
||
</div>
|
||
|
||
|
||
|
||
</header>
|
||
|
||
|
||
<p>Below are three ways to run multi-node training with Axolotl.</p>
|
||
<div class="callout callout-style-default callout-important callout-titled">
|
||
<div class="callout-header d-flex align-content-center">
|
||
<div class="callout-icon-container">
|
||
<i class="callout-icon"></i>
|
||
</div>
|
||
<div class="callout-title-container flex-fill">
|
||
Important
|
||
</div>
|
||
</div>
|
||
<div class="callout-body-container callout-body">
|
||
<p>Each machine needs a copy of Axolotl; we suggest using the same commit on every machine to ensure compatibility.</p>
|
||
<p>You will also need to have the same configuration file for your model on each machine.</p>
|
||
<p>Make sure the main machine is reachable by other machines.</p>
|
||
</div>
|
||
</div>
|
||
<section id="accelerate" class="level2">
|
||
<h2 class="anchored" data-anchor-id="accelerate">Accelerate</h2>
|
||
<p>You will need to create a configuration for accelerate, either by running <code>accelerate config</code> and following the instructions, or by using the preset below:</p>
|
||
<p>~/.cache/huggingface/accelerate/default_config.yaml</p>
|
||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">compute_environment</span><span class="kw">:</span><span class="at"> LOCAL_MACHINE</span></span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">debug</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="fu">distributed_type</span><span class="kw">:</span><span class="at"> FSDP</span></span>
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="fu">downcast_bf16</span><span class="kw">:</span><span class="at"> </span><span class="st">'no'</span></span>
<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="fu">machine_rank</span><span class="kw">:</span><span class="at"> </span><span class="dv">0</span><span class="co"> # Set to 0 for the main machine, increment by one for other machines</span></span>
<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="fu">main_process_ip</span><span class="kw">:</span><span class="at"> </span><span class="fl">10.0.0.4</span><span class="co"> # Set to main machine's IP</span></span>
<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a><span class="fu">main_process_port</span><span class="kw">:</span><span class="at"> </span><span class="dv">5000</span></span>
<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="fu">main_training_function</span><span class="kw">:</span><span class="at"> main</span></span>
<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a><span class="fu">mixed_precision</span><span class="kw">:</span><span class="at"> bf16</span></span>
<span id="cb1-10"><a href="#cb1-10" aria-hidden="true" tabindex="-1"></a><span class="fu">num_machines</span><span class="kw">:</span><span class="at"> </span><span class="dv">2</span><span class="co"> # Change to the number of machines</span></span>
<span id="cb1-11"><a href="#cb1-11" aria-hidden="true" tabindex="-1"></a><span class="fu">num_processes</span><span class="kw">:</span><span class="at"> </span><span class="dv">4</span><span class="co"> # Total number of GPUs across all machines (for example: if you have 2 machines with 4 GPUs each, put 8)</span></span>
<span id="cb1-12"><a href="#cb1-12" aria-hidden="true" tabindex="-1"></a><span class="fu">rdzv_backend</span><span class="kw">:</span><span class="at"> static</span></span>
<span id="cb1-13"><a href="#cb1-13" aria-hidden="true" tabindex="-1"></a><span class="fu">same_network</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb1-14"><a href="#cb1-14" aria-hidden="true" tabindex="-1"></a><span class="fu">tpu_env</span><span class="kw">:</span><span class="at"> </span><span class="kw">[]</span></span>
<span id="cb1-15"><a href="#cb1-15" aria-hidden="true" tabindex="-1"></a><span class="fu">tpu_use_cluster</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
<span id="cb1-16"><a href="#cb1-16" aria-hidden="true" tabindex="-1"></a><span class="fu">tpu_use_sudo</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
<span id="cb1-17"><a href="#cb1-17" aria-hidden="true" tabindex="-1"></a><span class="fu">use_cpu</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||
<p>Configure your model to use FSDP in the Axolotl yaml. For example:</p>
|
||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_version</span><span class="kw">:</span><span class="at"> </span><span class="dv">2</span></span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_config</span><span class="kw">:</span></span>
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">offload_params</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">state_dict_type</span><span class="kw">:</span><span class="at"> FULL_STATE_DICT</span></span>
<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">auto_wrap_policy</span><span class="kw">:</span><span class="at"> TRANSFORMER_BASED_WRAP</span></span>
<span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">transformer_layer_cls_to_wrap</span><span class="kw">:</span><span class="at"> LlamaDecoderLayer</span></span>
<span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">reshard_after_forward</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||
<p>All you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.</p>
|
||
</section>
|
||
<section id="raytrain" class="level2">
|
||
<h2 class="anchored" data-anchor-id="raytrain">Raytrain</h2>
|
||
<p>Please see the <a href="../docs/ray-integration.html">Ray Train integration documentation</a>.</p>
|
||
</section>
|
||
<section id="torchrun" class="level2">
|
||
<h2 class="anchored" data-anchor-id="torchrun">Torchrun</h2>
|
||
<p>If you are using Infiniband, we recommend torchrun to utilize the full bandwidth.</p>
|
||
<p>Set the following env (change buffersize/socketname depending on your system):</p>
|
||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb3"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">NCCL_IB_DISABLE</span><span class="op">=</span>0</span>
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">NCCL_SOCKET_IFNAME</span><span class="op">=</span><span class="st">"eth0,en,eth,em,bond"</span></span>
<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">NCCL_BUFFSIZE</span><span class="op">=</span>2097152</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||
<p>Run the following on each node:</p>
|
||
<section id="option-1-new-axolotl-cli-with-launcher-args-recommended" class="level3">
|
||
<h3 class="anchored" data-anchor-id="option-1-new-axolotl-cli-with-launcher-args-recommended">Option 1: New Axolotl CLI with launcher args (Recommended)</h3>
|
||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb4"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> train config.yaml <span class="at">--launcher</span> torchrun <span class="at">--</span> <span class="at">--nnodes</span> <span class="va">$num_nodes</span> <span class="at">--nproc_per_node</span> <span class="va">$gpu_per_node</span> <span class="at">--rdzv_id</span> <span class="va">$rdzv_id</span> <span class="at">--rdzv_backend</span> c10d <span class="at">--rdzv_endpoint</span> <span class="st">"</span><span class="va">$head_node_ip</span><span class="st">:</span><span class="va">$head_node_port</span><span class="st">"</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||
</section>
|
||
<section id="option-2-direct-torchrun-legacy" class="level3">
|
||
<h3 class="anchored" data-anchor-id="option-2-direct-torchrun-legacy">Option 2: Direct torchrun (Legacy)</h3>
|
||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb5"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="ex">torchrun</span> <span class="at">--nnodes</span> <span class="va">$num_nodes</span> <span class="at">--nproc_per_node</span> <span class="va">$gpu_per_node</span> <span class="at">--rdzv_id</span> <span class="va">$rdzv_id</span> <span class="at">--rdzv_backend</span> c10d <span class="at">--rdzv_endpoint</span> <span class="st">"</span><span class="va">$head_node_ip</span><span class="st">:</span><span class="va">$head_node_port</span><span class="st">"</span> <span class="at">-m</span> axolotl.cli.train config.yaml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||
<p>Please make sure to substitute the placeholder variables:</p>
|
||
<ul>
|
||
<li><code>num_nodes</code>: Number of nodes (containing GPUs)</li>
|
||
<li><code>gpu_per_node</code>: Number of GPUs per node</li>
|
||
<li><code>head_node_ip</code>: IP of the head node (make sure other machines can connect to this)</li>
|
||
<li><code>head_node_port</code>: Port of the head node (make sure other machines can connect to this. Default 29400)</li>
|
||
<li><code>rdzv_id</code>: A unique job ID that is used by the job across nodes.</li>
|
||
</ul>
|
||
<p>The new CLI approach (Option 1) is recommended as it provides consistent argument handling and works seamlessly with other Axolotl CLI features.</p>
|
||
<p>More information on the available options can be found in the <a href="https://pytorch.org/docs/stable/elastic/run.html">PyTorch torchrun documentation</a>.</p>
|
||
|
||
|
||
</section>
|
||
</section>
|
||
|
||
</main> <!-- /main -->
|
||
<script id="quarto-html-after-body" type="application/javascript">
|
||
window.document.addEventListener("DOMContentLoaded", function (event) {
|
||
const icon = "";
|
||
const anchorJS = new window.AnchorJS();
|
||
anchorJS.options = {
|
||
placement: 'right',
|
||
icon: icon
|
||
};
|
||
anchorJS.add('.anchored');
|
||
// Reports whether an element carries at least one class whose name starts
// with "code-annotation-".
//   el: any object exposing an iterable `classList` of strings.
//   returns: true if such a class is present, false otherwise.
const isCodeAnnotation = (el) => {
  return Array.from(el.classList).some((className) =>
    className.startsWith('code-annotation-')
  );
};
|
||
// "success" handler for the copy buttons.
// Gives visual feedback on the triggering button for one second: adds the
// "code-copy-button-checked" class, swaps the title to "Copied!", and, when
// window.bootstrap is present, shows a manual Bootstrap tooltip. Afterwards
// the original title and class state are restored.
//   e: the event passed by the clipboard library; this handler reads
//      `e.trigger` (the button) and calls `e.clearSelection()`.
const onCopySuccess = function(e) {
  const button = e.trigger;

  // Drop focus so the button does not stay highlighted after the click.
  button.blur();

  // Flash the "checked" state.
  button.classList.add('code-copy-button-checked');
  const previousTitle = button.getAttribute("title");
  button.setAttribute("title", "Copied!");

  let tooltip;
  if (window.bootstrap) {
    button.setAttribute("data-bs-toggle", "tooltip");
    button.setAttribute("data-bs-placement", "left");
    button.setAttribute("data-bs-title", "Copied!");
    tooltip = new bootstrap.Tooltip(button, {
      trigger: "manual",
      customClass: "code-copy-button-tooltip",
      offset: [0, -8]
    });
    tooltip.show();
  }

  // Undo the feedback after one second.
  setTimeout(function() {
    if (tooltip) {
      tooltip.hide();
      button.removeAttribute("data-bs-title");
      button.removeAttribute("data-bs-toggle");
      button.removeAttribute("data-bs-placement");
    }
    button.setAttribute("title", previousTitle);
    button.classList.remove('code-copy-button-checked');
  }, 1000);

  // Clear the text selection made by the copy operation.
  e.clearSelection();
};
|
||
// Produces the text placed on the clipboard for a copy button.
// Works on a deep clone of the button's parent so the rendered page is never
// modified, strips direct children of the <code> element that are code
// annotations, and returns the remaining text.
//   trigger: the clicked copy button.
//   returns: the `innerText` of the cloned <code> element.
const getTextToCopy = function(trigger) {
  const outerScaffold = trigger.parentElement.cloneNode(true);
  const codeEl = outerScaffold.querySelector('code');
  // `children` is a live collection: removing a node while iterating it
  // directly shifts the remaining items and skips the sibling that follows
  // each removed annotation. Iterate a snapshot instead.
  for (const childEl of Array.from(codeEl.children)) {
    if (isCodeAnnotation(childEl)) {
      childEl.remove();
    }
  }
  return codeEl.innerText;
};
|
||
const clipboard = new window.ClipboardJS('.code-copy-button:not([data-in-quarto-modal])', {
|
||
text: getTextToCopy
|
||
});
|
||
clipboard.on('success', onCopySuccess);
|
||
if (window.document.getElementById('quarto-embedded-source-code-modal')) {
|
||
const clipboardModal = new window.ClipboardJS('.code-copy-button[data-in-quarto-modal]', {
|
||
text: getTextToCopy,
|
||
container: window.document.getElementById('quarto-embedded-source-code-modal')
|
||
});
|
||
clipboardModal.on('success', onCopySuccess);
|
||
}
|
||
// Patterns for hrefs that are treated as internal to the site.
var localhostRegex = /^(?:http|https):\/\/localhost\:?[0-9]*\//;
var mailtoRegex = /^mailto:/;
// Written as a regex literal on purpose: inside a string literal "\." collapses
// to ".", which turns every dot of the host name into a match-anything wildcard.
var filterRegex = /https:\/\/docs\.axolotl\.ai/;

// Returns true when `href` points at the documentation site, at localhost,
// or is a mailto: link.
var isInternal = (href) => {
  return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
};
|
||
// Inspect non-navigation links and adorn them if external
|
||
var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
|
||
for (var i=0; i<links.length; i++) {
|
||
const link = links[i];
|
||
if (!isInternal(link.href)) {
|
||
// undo the damage that might have been done by quarto-nav.js in the case of
|
||
// links that we want to consider external
|
||
if (link.dataset.originalHref !== undefined) {
|
||
link.href = link.dataset.originalHref;
|
||
}
|
||
}
|
||
}
|
||
// Attaches a hover popup to `el` through window.tippy using the shared
// "quarto" theme settings.
//   el:            element the popup is attached to.
//   contentFn:     optional; passed as the popup `content` option.
//   onTriggerFn:   optional; passed as the `onTrigger` option.
//   onUntriggerFn: optional; passed as the `onUntrigger` option.
// Options whose argument is falsy are left off the config entirely.
function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
  const config = {
    allowHTML: true,
    maxWidth: 500,
    delay: 100,
    arrow: false,
    // Mount the popup next to the reference element rather than on <body>.
    appendTo: function(el) {
      return el.parentElement;
    },
    interactive: true,
    interactiveBorder: 10,
    theme: 'quarto',
    placement: 'bottom-start',
  };
  if (contentFn) {
    config.content = contentFn;
  }
  if (onTriggerFn) {
    config.onTrigger = onTriggerFn;
  }
  if (onUntriggerFn) {
    config.onUntrigger = onUntriggerFn;
  }
  window.tippy(el, config);
}
|
||
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
|
||
for (var i=0; i<noterefs.length; i++) {
|
||
const ref = noterefs[i];
|
||
tippyHover(ref, function() {
|
||
// use id or data attribute instead here
|
||
let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
|
||
try { href = new URL(href).hash; } catch {}
|
||
const id = href.replace(/^#\/?/, "");
|
||
const note = window.document.getElementById(id);
|
||
if (note) {
|
||
return note.innerHTML;
|
||
} else {
|
||
return "";
|
||
}
|
||
});
|
||
}
|
||
const xrefs = window.document.querySelectorAll('a.quarto-xref');
|
||
// Builds the HTML shown in a cross-reference popup.
//   id:   id of the referenced element, or null when the whole page body was
//         fetched (no hash in the URL).
//   note: the referenced element; it is modified in place, so callers pass a
//         clone or a node from a separately parsed document.
//   returns: an HTML string.
// Section targets (id is null or starts with "sec-") with more than two
// children are reduced to their first child plus the first following child
// that is not an empty <p>. Other targets are returned in full, with any
// AnchorJS link removed; callouts keep their outer element.
const processXRef = (id, note) => {
  // Recursively strip column container classes from the subtree.
  const stripColumnClz = (el) => {
    el.classList.remove("page-full", "page-columns");
    if (el.children) {
      for (const child of el.children) {
        stripColumnClz(child);
      }
    }
  };

  // Typeset math in `el` when the Quarto helper is available.
  const typesetMath = (el) => {
    if (window.Quarto?.typesetMath) {
      window.Quarto.typesetMath(el);
    }
  };

  stripColumnClz(note);

  const isSection = id === null || id.startsWith('sec-');
  if (!isSection) {
    // Remove any anchor links if they are present
    const anchorLink = note.querySelector('a.anchorjs-link');
    if (anchorLink) {
      anchorLink.remove();
    }
    typesetMath(note);
    return note.classList.contains("callout") ? note.outerHTML : note.innerHTML;
  }

  // Short sections are shown as they are.
  if (!(note.children && note.children.length > 2)) {
    typesetMath(note);
    return note.innerHTML;
  }

  // Special case sections, only their first couple elements
  const container = document.createElement("div");
  container.appendChild(note.children[0].cloneNode(true));
  for (let i = 1; i < note.children.length; i++) {
    const child = note.children[i];
    if (child.tagName === "P" && child.innerText === "") {
      continue;
    }
    container.appendChild(child.cloneNode(true));
    break;
  }
  typesetMath(container);
  return container.innerHTML;
};
|
||
for (var i=0; i<xrefs.length; i++) {
|
||
const xref = xrefs[i];
|
||
tippyHover(xref, undefined, function(instance) {
|
||
instance.disable();
|
||
let url = xref.getAttribute('href');
|
||
let hash = undefined;
|
||
if (url.startsWith('#')) {
|
||
hash = url;
|
||
} else {
|
||
try { hash = new URL(url).hash; } catch {}
|
||
}
|
||
if (hash) {
|
||
const id = hash.replace(/^#\/?/, "");
|
||
const note = window.document.getElementById(id);
|
||
if (note !== null) {
|
||
try {
|
||
const html = processXRef(id, note.cloneNode(true));
|
||
instance.setContent(html);
|
||
} finally {
|
||
instance.enable();
|
||
instance.show();
|
||
}
|
||
} else {
|
||
// See if we can fetch this
|
||
fetch(url.split('#')[0])
|
||
.then(res => res.text())
|
||
.then(html => {
|
||
const parser = new DOMParser();
|
||
const htmlDoc = parser.parseFromString(html, "text/html");
|
||
const note = htmlDoc.getElementById(id);
|
||
if (note !== null) {
|
||
const html = processXRef(id, note);
|
||
instance.setContent(html);
|
||
}
|
||
}).finally(() => {
|
||
instance.enable();
|
||
instance.show();
|
||
});
|
||
}
|
||
} else {
|
||
// See if we can fetch a full url (with no hash to target)
|
||
// This is a special case and we should probably do some content thinning / targeting
|
||
fetch(url)
|
||
.then(res => res.text())
|
||
.then(html => {
|
||
const parser = new DOMParser();
|
||
const htmlDoc = parser.parseFromString(html, "text/html");
|
||
const note = htmlDoc.querySelector('main.content');
|
||
if (note !== null) {
|
||
// This should only happen for chapter cross references
|
||
// (since there is no id in the URL)
|
||
// remove the first header
|
||
if (note.children.length > 0 && note.children[0].tagName === "HEADER") {
|
||
note.children[0].remove();
|
||
}
|
||
const html = processXRef(null, note);
|
||
instance.setContent(html);
|
||
}
|
||
}).finally(() => {
|
||
instance.enable();
|
||
instance.show();
|
||
});
|
||
}
|
||
}, function(instance) {
|
||
});
|
||
}
|
||
let selectedAnnoteEl;
|
||
// Builds the CSS selector for the <span> that marks a code annotation.
//   cell:       value expected in the span's data-code-cell attribute.
//   annotation: value expected in the span's data-code-annotation attribute.
//   returns: a selector string of the form
//            span[data-code-cell="…"][data-code-annotation="…"]
const selectorForAnnotation = (cell, annotation) => {
  return `span[data-code-cell="${cell}"][data-code-annotation="${annotation}"]`;
};
|
||
// Highlights the code lines that belong to an annotation entry.
//   annoteEl: element carrying data-target-cell and data-target-annotation.
// Looks up the matching annotation <span>, reads its comma-separated
// data-code-lines, and positions two absolutely placed <div>s (created on
// first use) over the first-to-last line range: one inside the code block and
// one inside the cell's ".code-annotation-gutter". Records `annoteEl` in
// `selectedAnnoteEl` on success. Does nothing when the annotation span or its
// first line element cannot be found.
const selectCodeLines = (annoteEl) => {
  const targetCell = annoteEl.getAttribute("data-target-cell");
  const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
  const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
  const codeLines = annoteSpan ? annoteSpan.getAttribute("data-code-lines") : null;
  if (codeLines === null) {
    // No matching annotation in the page: nothing to highlight.
    return;
  }
  const lineIds = codeLines.split(",").map((line) => targetCell + "-" + line);

  // Compute the vertical extent from the first (and, if present, last) line.
  const firstEl = window.document.getElementById(lineIds[0]);
  if (firstEl === null) {
    return;
  }
  const top = firstEl.offsetTop;
  let height = firstEl.offsetHeight;
  const parent = firstEl.parentElement?.parentElement ?? null;
  if (lineIds.length > 1) {
    const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
    if (lastEl !== null) {
      height = lastEl.offsetTop + lastEl.offsetHeight - top;
    }
  }

  if (parent !== null) {
    // cook up a div (if necessary) and position it
    let div = window.document.getElementById("code-annotation-line-highlight");
    if (div === null) {
      div = window.document.createElement("div");
      div.setAttribute("id", "code-annotation-line-highlight");
      div.style.position = 'absolute';
      parent.appendChild(div);
    }
    div.style.top = top - 2 + "px";
    div.style.height = height + 4 + "px";
    div.style.left = 0;

    let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
    if (gutterDiv === null) {
      gutterDiv = window.document.createElement("div");
      gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
      gutterDiv.style.position = 'absolute';
      const codeCell = window.document.getElementById(targetCell);
      const gutter = codeCell ? codeCell.querySelector('.code-annotation-gutter') : null;
      if (gutter) {
        gutter.appendChild(gutterDiv);
      }
    }
    gutterDiv.style.top = top - 2 + "px";
    gutterDiv.style.height = height + 4 + "px";
  }
  selectedAnnoteEl = annoteEl;
};
|
||
// Removes the code-line and gutter highlight elements (if present) and clears
// the record of the currently selected annotation.
const unselectCodeLines = () => {
  const highlightIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"];
  for (const highlightId of highlightIds) {
    window.document.getElementById(highlightId)?.remove();
  }
  selectedAnnoteEl = undefined;
};
|
||
// Handle positioning of the toggle
|
||
// Re-position the line highlight for the selected annotation (if any) whenever
// the window is resized; calls are rate-limited through throttle(…, 10).
window.addEventListener(
  "resize",
  throttle(() => {
    // NOTE(review): elRect is not declared anywhere in the visible script, so
    // this assignment creates/overwrites a global — confirm whether anything
    // still reads it before removing.
    elRect = undefined;
    if (selectedAnnoteEl) {
      selectCodeLines(selectedAnnoteEl);
    }
  }, 10)
);
|
||
// Wraps `fn` so that bursts of calls are collapsed.
//   fn: function to invoke.
//   ms: delay in milliseconds applied to calls after the first.
//   returns: a wrapper. The first call runs `fn` immediately. Every later call
//            (re)starts a timer of `ms`; when it fires, `fn` runs with the most
//            recent arguments and the wrapper resets, so the next call runs
//            immediately again.
function throttle(fn, ms) {
  let engaged = false;
  let timer;
  return (...args) => {
    if (!engaged) {
      // first call gets through
      fn.apply(this, args);
      engaged = true;
      return;
    }
    // all the others get delayed; a newer call cancels the pending one
    if (timer) clearTimeout(timer);
    timer = setTimeout(() => {
      fn.apply(this, args);
      timer = engaged = false;
    }, ms);
  };
}
|
||
// Attach click handler to the DT
|
||
const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
|
||
for (const annoteDlNode of annoteDls) {
|
||
annoteDlNode.addEventListener('click', (event) => {
|
||
const clickedEl = event.target;
|
||
if (clickedEl !== selectedAnnoteEl) {
|
||
unselectCodeLines();
|
||
const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
|
||
if (activeEl) {
|
||
activeEl.classList.remove('code-annotation-active');
|
||
}
|
||
selectCodeLines(clickedEl);
|
||
clickedEl.classList.add('code-annotation-active');
|
||
} else {
|
||
// Unselect the line
|
||
unselectCodeLines();
|
||
clickedEl.classList.remove('code-annotation-active');
|
||
}
|
||
});
|
||
}
|
||
// Walks up from `el` until an element whose parent has a non-empty
// `dataset.cites` is found.
//   el: starting element.
//   returns: { el, cites } where `el` is the element directly below the parent
//            that carries the data and `cites` is that value split on single
//            spaces; undefined when the top of the tree is reached first.
const findCites = (el) => {
  let current = el;
  while (current.parentElement) {
    const cites = current.parentElement.dataset.cites;
    if (cites) {
      return {
        el: current,
        cites: cites.split(' ')
      };
    }
    current = current.parentElement;
  }
  return undefined;
};
|
||
var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
|
||
for (var i=0; i<bibliorefs.length; i++) {
|
||
const ref = bibliorefs[i];
|
||
const citeInfo = findCites(ref);
|
||
if (citeInfo) {
|
||
tippyHover(citeInfo.el, function() {
|
||
var popup = window.document.createElement('div');
|
||
citeInfo.cites.forEach(function(cite) {
|
||
var citeDiv = window.document.createElement('div');
|
||
citeDiv.classList.add('hanging-indent');
|
||
citeDiv.classList.add('csl-entry');
|
||
var biblioDiv = window.document.getElementById('ref-' + cite);
|
||
if (biblioDiv) {
|
||
citeDiv.innerHTML = biblioDiv.innerHTML;
|
||
}
|
||
popup.appendChild(citeDiv);
|
||
});
|
||
return popup.innerHTML;
|
||
});
|
||
}
|
||
}
|
||
});
|
||
</script>
|
||
</div> <!-- /content -->
|
||
|
||
|
||
|
||
|
||
</body></html> |