1463 lines
73 KiB
HTML
1463 lines
73 KiB
HTML
<!DOCTYPE html>
|
||
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
|
||
|
||
<meta charset="utf-8">
|
||
<meta name="generator" content="quarto-1.9.37">
|
||
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
|
||
|
||
|
||
<title>model_architectures – Axolotl</title>
|
||
<style>
/* Default styles provided by pandoc.
** See https://pandoc.org/MANUAL.html#variables-for-html for config info.
*/
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.columns{display: flex; gap: min(4vw, 1.5em);}
div.column{flex: auto; overflow-x: auto;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
ul.task-list li input[type="checkbox"] {
  width: 0.8em;
  margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
  vertical-align: middle;
}
/* CSS for syntax highlighting */
html { -webkit-text-size-adjust: 100%; }
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
  div.sourceCode { overflow: auto; }
}
@media print {
  pre > code.sourceCode { white-space: pre-wrap; }
  pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
/* Numbered code listings: each line span increments the source-line counter
   and the first anchor in the line renders it as a non-selectable gutter. */
pre.numberSource code
  { counter-reset: source-line 0; }
pre.numberSource code > span
  { position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
  { content: counter(source-line);
    position: relative; left: -1em; text-align: right; vertical-align: baseline;
    border: none; display: inline-block;
    -webkit-touch-callout: none; -webkit-user-select: none;
    -khtml-user-select: none; -moz-user-select: none;
    -ms-user-select: none; user-select: none;
    padding: 0 4px; width: 4em;
  }
pre.numberSource { margin-left: 3em; padding-left: 4px; }
/* Intentionally empty rule kept from the generated template. */
div.sourceCode
  { }
@media screen {
  pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
</style>
|
||
|
||
|
||
<script src="../../site_libs/quarto-nav/quarto-nav.js"></script>
|
||
<script src="../../site_libs/clipboard/clipboard.min.js"></script>
|
||
<script src="../../site_libs/quarto-search/autocomplete.umd.js"></script>
|
||
<script src="../../site_libs/quarto-search/fuse.min.js"></script>
|
||
<script src="../../site_libs/quarto-search/quarto-search.js"></script>
|
||
<meta name="quarto:offset" content="../../">
|
||
<link href="../../favicon.jpg" rel="icon" type="image/jpeg">
|
||
<script src="../../site_libs/quarto-html/quarto.js" type="module"></script>
|
||
<script src="../../site_libs/quarto-html/tabsets/tabsets.js" type="module"></script>
|
||
<script src="../../site_libs/quarto-html/popper.min.js"></script>
|
||
<script src="../../site_libs/quarto-html/tippy.umd.min.js"></script>
|
||
<script src="../../site_libs/quarto-html/anchor.min.js"></script>
|
||
<link href="../../site_libs/quarto-html/tippy.css" rel="stylesheet">
|
||
<link href="../../site_libs/quarto-html/quarto-syntax-highlighting-dark-d0ae9245876894da5ac7e18953ecc5cc.css" rel="stylesheet" id="quarto-text-highlighting-styles">
|
||
<script src="../../site_libs/bootstrap/bootstrap.min.js"></script>
|
||
<link href="../../site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
|
||
<link href="../../site_libs/bootstrap/bootstrap-ab6ebd6eb475c4578b58908bc314f719.min.css" rel="stylesheet" append-hash="true" id="quarto-bootstrap" data-mode="dark">
|
||
<script id="quarto-search-options" type="application/json">{
  "location": "navbar",
  "copy-button": false,
  "collapse-after": 3,
  "panel-placement": "end",
  "type": "overlay",
  "limit": 50,
  "keyboard-shortcut": [
    "f",
    "/",
    "s"
  ],
  "show-item-context": false,
  "language": {
    "search-no-results-text": "No results",
    "search-matching-documents-text": "matching documents",
    "search-copy-link-title": "Copy link to search",
    "search-hide-matches-text": "Hide additional matches",
    "search-more-match-text": "more match in this document",
    "search-more-matches-text": "more matches in this document",
    "search-clear-button-title": "Clear",
    "search-text-placeholder": "",
    "search-detached-cancel-button-title": "Cancel",
    "search-submit-button-title": "Submit",
    "search-label": "Search"
  }
}</script>
|
||
<script async="" src="https://www.googletagmanager.com/gtag/js?id=G-9KYCVJBNMQ"></script>
|
||
|
||
<script>
  // Google Analytics bootstrap: gtag() queues its arguments on the global
  // dataLayer array, creating the array first if it does not exist yet.
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());
  // Configure the property with IP anonymization requested.
  gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
</script>
|
||
|
||
|
||
<link rel="stylesheet" href="../../styles.css">
|
||
</head>
|
||
|
||
<body class="nav-sidebar docked nav-fixed quarto-light">
|
||
|
||
<div id="quarto-search-results"></div>
|
||
<header id="quarto-header" class="headroom fixed-top">
|
||
<nav class="navbar navbar-expand " data-bs-theme="dark">
|
||
<div class="navbar-container container-fluid">
|
||
<div class="navbar-brand-container mx-auto">
|
||
<!-- Brand link: both logo variants are decorative (alt=""), so the link's accessible name comes from aria-label. -->
<a href="../../index.html" class="navbar-brand navbar-brand-logo" aria-label="Axolotl home">
<img src="../../image/axolotl_logo_digital_white.svg" alt="" class="navbar-logo light-content">
<img src="../../image/axolotl_logo_digital_white.svg" alt="" class="navbar-logo dark-content">
</a>
|
||
</div>
|
||
<div class="quarto-navbar-tools tools-wide tools-end">
|
||
<a href="https://twitter.com/axolotl_ai" class="quarto-navigation-tool px-1" aria-label="Axolotl on Twitter"><i class="bi bi-twitter" aria-hidden="true"></i></a>
|
||
<a href="https://github.com/axolotl-ai-cloud/axolotl/" class="quarto-navigation-tool px-1" aria-label="Axolotl on GitHub"><i class="bi bi-github" aria-hidden="true"></i></a>
|
||
<a href="https://discord.gg/7m9sfhzaf3" class="quarto-navigation-tool px-1" aria-label="Axolotl on Discord"><i class="bi bi-discord" aria-hidden="true"></i></a>
|
||
</div>
|
||
<div id="quarto-search" class="" title="Search"></div>
|
||
</div> <!-- /container-fluid -->
|
||
</nav>
|
||
<nav class="quarto-secondary-nav">
|
||
<div class="container-fluid d-flex">
|
||
<!-- Sidebar toggle: a native button, so no explicit role is needed; the icon is decorative and the name comes from aria-label. -->
<button type="button" class="quarto-btn-toggle btn" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
<i class="bi bi-layout-text-sidebar-reverse" aria-hidden="true"></i>
</button>
|
||
<nav class="quarto-page-breadcrumbs" aria-label="breadcrumb"><ol class="breadcrumb"></ol></nav>
|
||
<!-- Pointer-only click strip that toggles the same sidebar target as the button in this bar. It has no href (not focusable) and no content, so it is hidden from assistive technology rather than exposed as a landmark. -->
<a class="flex-grow-1" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item" aria-hidden="true" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
</a>
|
||
</div>
|
||
</nav>
|
||
</header>
|
||
<!-- content -->
|
||
<div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article page-navbar">
|
||
<!-- sidebar -->
|
||
<nav id="quarto-sidebar" class="sidebar collapse collapse-horizontal quarto-sidebar-collapse-item sidebar-navigation docked overflow-auto">
|
||
<div class="sidebar-menu-container">
|
||
<ul class="list-unstyled mt-1">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../index.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Home</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true">
|
||
<span class="menu-text">Getting Started</span></a>
|
||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-1" class="collapse list-unstyled sidebar-section depth1 show">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/getting-started.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Quickstart</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/choosing_method.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Which Fine-Tuning Method Should I Use?</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/installation.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Installation</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/inference.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Inference and Merging</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" role="navigation" aria-expanded="false">
|
||
<span class="menu-text">Model Guides</span></a>
|
||
<a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" role="navigation" aria-expanded="false" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-2" class="collapse list-unstyled sidebar-section depth2 ">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/kimi-linear.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Kimi Linear</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/plano.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Plano Orchestrator</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/mimo.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">MiMo</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/internvl3_5.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">InternVL 3.5</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/olmo3.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">OLMo 3</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/trinity.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Trinity</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/arcee.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Arcee AFM</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="false">
|
||
<span class="menu-text">Ministral3</span></a>
|
||
<a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="false" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-3" class="collapse list-unstyled sidebar-section depth3 ">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/ministral3.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Ministral3</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/ministral3/think.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Ministral 3 Thinking</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/ministral3/vision.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Ministral 3 Vision</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-4" role="navigation" aria-expanded="false">
|
||
<span class="menu-text">Magistral</span></a>
|
||
<a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-4" role="navigation" aria-expanded="false" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-4" class="collapse list-unstyled sidebar-section depth3 ">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/magistral.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Magistral</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/magistral/think.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Magistral Thinking</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/magistral/vision.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Magistral Vision</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/ministral.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Ministral</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/mistral-small.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Mistral Small 3.1/3.2</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/voxtral.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Voxtral</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/devstral.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Devstral</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/mistral.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Mistral 7B</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/llama-4.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Llama 4</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/llama-2.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Llama 2</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/qwen3-next.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Qwen 3 Next</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/qwen3.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Qwen 3</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/gemma3n.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Gemma 3n</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/apertus.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Apertus</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/gpt-oss.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">GPT-OSS</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/seed-oss.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Seed-OSS</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/phi.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Phi</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/smolvlm2.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">SmolVLM 2</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/granite4.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Granite 4</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/LiquidAI.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Liquid Foundation Models 2</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/hunyuan.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Hunyuan</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/jamba.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Jamba</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/models/orpheus.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Orpheus</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/cli.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Command Line Interface (CLI)</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/telemetry.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Telemetry</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/config-reference.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Config Reference</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/api" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">API Reference</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/dataset-formats/index.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Dataset Formats</span></a>
|
||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-5" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-5" class="collapse list-unstyled sidebar-section depth1 show">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/dataset-formats/pretraining.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Pre-training</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/dataset-formats/inst_tune.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Instruction Tuning</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/dataset-formats/conversation.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Conversation</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/dataset-formats/stepwise_supervised.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Stepwise Supervised Format</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/dataset-formats/template_free.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Template-Free</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/dataset-formats/tokenized.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Custom Pre-Tokenized Dataset</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-6" role="navigation" aria-expanded="true">
|
||
<span class="menu-text">Deployments</span></a>
|
||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-6" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-6" class="collapse list-unstyled sidebar-section depth1 show">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/docker.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Docker</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/multi-gpu.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Multi-GPU</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/multi-node.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Multi Node</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Ray Train</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">AMD GPUs on HPC Systems</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/mac.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Mac M-series</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-7" role="navigation" aria-expanded="true">
|
||
<span class="menu-text">How To Guides</span></a>
|
||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-7" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-7" class="collapse list-unstyled sidebar-section depth1 show">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/multimodal.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">MultiModal / Vision Language Models (BETA)</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/rlhf.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">RLHF (Beta)</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/grpo.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">GRPO Training</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/ebft.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">EBFT Training</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/vllm_serving.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">vLLM Serving for GRPO Training</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/reward_modelling.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Reward Modelling</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/lr_groups.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Learning Rate Groups</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/lora_optims.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">LoRA Optimizations</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/dataset_loading.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Dataset Loading</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/qat.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Quantization Aware Training (QAT)</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/quantize.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Quantization with torchao</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/optimizations.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Optimizations Guide</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-8" role="navigation" aria-expanded="true">
|
||
<span class="menu-text">Core Concepts</span></a>
|
||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-8" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-8" class="collapse list-unstyled sidebar-section depth1 show">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/batch_vs_grad.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Batch size vs Gradient accumulation</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/dataset_preprocessing.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Dataset Preprocessing</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/streaming.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Streaming Datasets</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/multipack.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Multipack (Sample Packing)</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/mixed_precision.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Mixed Precision Training</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/optimizers.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Optimizers</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/attention.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Attention</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-9" role="navigation" aria-expanded="true">
|
||
<span class="menu-text">Advanced Features</span></a>
|
||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-9" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-9" class="collapse list-unstyled sidebar-section depth1 show">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">FSDP + QLoRA</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/torchao.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">PyTorch ao</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/custom_integrations.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Custom Integrations</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/sequence_parallelism.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Sequence Parallelism</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/gradient_checkpointing.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Gradient Checkpointing, Activation Offloading, and Layer Offloading</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/nd_parallelism.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">N-D Parallelism (Beta)</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/expert_quantization.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">MoE Expert Quantization</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-10" role="navigation" aria-expanded="true">
|
||
<span class="menu-text">Troubleshooting</span></a>
|
||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-10" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-10" class="collapse list-unstyled sidebar-section depth1 show">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/faq.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">FAQ</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/training_stability.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Training Stability &amp; Debugging</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/debugging.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Debugging</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/nccl.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">NCCL</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
</ul>
|
||
</div>
|
||
</nav>
|
||
<div id="quarto-sidebar-glass" class="quarto-sidebar-collapse-item" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item"></div>
|
||
<!-- margin-sidebar -->
|
||
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
|
||
<nav id="TOC" role="doc-toc" class="toc-active">
|
||
<h2 id="toc-title">On this page</h2>
|
||
|
||
<ul>
|
||
<li><a href="#model-architectures-agent-reference" id="toc-model-architectures-agent-reference" class="nav-link active" data-scroll-target="#model-architectures-agent-reference">Model Architectures — Agent Reference</a>
|
||
<ul class="collapse">
|
||
<li><a href="#vlm-vision-language-model-quick-start" id="toc-vlm-vision-language-model-quick-start" class="nav-link" data-scroll-target="#vlm-vision-language-model-quick-start">VLM (Vision Language Model) Quick Start</a></li>
|
||
<li><a href="#plugins-optimizations" id="toc-plugins-optimizations" class="nav-link" data-scroll-target="#plugins-optimizations">Plugins &amp; Optimizations</a>
|
||
<ul class="collapse">
|
||
<li><a href="#cut-cross-entropy-cce" id="toc-cut-cross-entropy-cce" class="nav-link" data-scroll-target="#cut-cross-entropy-cce">Cut Cross Entropy (CCE)</a></li>
|
||
<li><a href="#scattermoe-kernels" id="toc-scattermoe-kernels" class="nav-link" data-scroll-target="#scattermoe-kernels">ScatterMoE Kernels</a></li>
|
||
</ul></li>
|
||
<li><a href="#gemma-4" id="toc-gemma-4" class="nav-link" data-scroll-target="#gemma-4">Gemma 4</a>
|
||
<ul class="collapse">
|
||
<li><a href="#required-settings" id="toc-required-settings" class="nav-link" data-scroll-target="#required-settings">Required settings</a></li>
|
||
<li><a href="#auto-detection" id="toc-auto-detection" class="nav-link" data-scroll-target="#auto-detection">Auto-detection</a></li>
|
||
<li><a href="#multi-gpu" id="toc-multi-gpu" class="nav-link" data-scroll-target="#multi-gpu">Multi-GPU</a></li>
|
||
<li><a href="#moe-26b-a4b" id="toc-moe-26b-a4b" class="nav-link" data-scroll-target="#moe-26b-a4b">MoE (26B-A4B)</a></li>
|
||
<li><a href="#vlm-vision-training" id="toc-vlm-vision-training" class="nav-link" data-scroll-target="#vlm-vision-training">VLM (Vision) Training</a></li>
|
||
<li><a href="#common-issues" id="toc-common-issues" class="nav-link" data-scroll-target="#common-issues">Common issues</a></li>
|
||
<li><a href="#e2be4b-dense-models" id="toc-e2be4b-dense-models" class="nav-link" data-scroll-target="#e2be4b-dense-models">E2B/E4B dense models</a></li>
|
||
</ul></li>
|
||
<li><a href="#gemma-3" id="toc-gemma-3" class="nav-link" data-scroll-target="#gemma-3">Gemma 3</a></li>
|
||
<li><a href="#qwen-3.5-moe" id="toc-qwen-3.5-moe" class="nav-link" data-scroll-target="#qwen-3.5-moe">Qwen 3.5 MoE</a></li>
|
||
<li><a href="#general-moe-notes" id="toc-general-moe-notes" class="nav-link" data-scroll-target="#general-moe-notes">General MoE Notes</a></li>
|
||
</ul></li>
|
||
</ul>
|
||
</nav>
|
||
</div>
|
||
<!-- main -->
|
||
<main class="content" id="quarto-document-content"><header id="title-block-header" class="quarto-title-block"></header>
|
||
|
||
|
||
|
||
|
||
<section id="model-architectures-agent-reference" class="level1">
|
||
<h1>Model Architectures — Agent Reference</h1>
|
||
<p>Model-specific quirks, required settings, and known issues. Check this before debugging training failures on specific model families.</p>
|
||
<section id="vlm-vision-language-model-quick-start" class="level2">
|
||
<h2 class="anchored" data-anchor-id="vlm-vision-language-model-quick-start">VLM (Vision Language Model) Quick Start</h2>
|
||
<p>All VLM configs require these four lines:</p>
|
||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">processor_type</span><span class="kw">:</span><span class="at"> AutoProcessor</span></span>
|
||
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">skip_prepare_dataset</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
|
||
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="fu">remove_unused_columns</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
|
||
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||
<p>Decision tree for VLM config:</p>
|
||
<pre class="text"><code>Is the model multimodal (has vision/audio encoder)?
|
||
├─ YES: Add `freeze_mm_modules: true` if training text only
|
||
│        Add `chat_template: &lt;model_template&gt;` (e.g. gemma4, qwen3_5, gemma3)
|
||
│ LoRA: use regex `lora_target_modules` to restrict to language model
|
||
└─ NO: Train as a regular text model
|
||
|
||
Is the model MoE (e.g. Gemma4 26B-A4B, Qwen3.5 35B-A3B)?
|
||
├─ YES: Add `lora_target_parameters` for expert LoRA
|
||
│ Consider ScatterMoE kernels (see Plugins section)
|
||
└─ NO: Standard LoRA config</code></pre>
|
||
</section>
|
||
<section id="plugins-optimizations" class="level2">
|
||
<h2 class="anchored" data-anchor-id="plugins-optimizations">Plugins &amp; Optimizations</h2>
|
||
<section id="cut-cross-entropy-cce" class="level3">
|
||
<h3 class="anchored" data-anchor-id="cut-cross-entropy-cce">Cut Cross Entropy (CCE)</h3>
|
||
<p>Computes loss from hidden states + lm_head weight without materializing the full logits tensor, saving significant VRAM. Install if not already present:</p>
|
||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb3"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="st">"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@main"</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb4"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plugins</span><span class="kw">:</span></span>
|
||
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||
</section>
|
||
<section id="scattermoe-kernels" class="level3">
|
||
<h3 class="anchored" data-anchor-id="scattermoe-kernels">ScatterMoE Kernels</h3>
|
||
<p>Fuses expert + LoRA computation into a single kernel for MoE models. Significant speedup for models with many experts.</p>
|
||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb5"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plugins</span><span class="kw">:</span></span>
|
||
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> axolotl.integrations.kernels.KernelsPlugin</span></span>
|
||
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a><span class="fu">use_kernels</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
|
||
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a><span class="fu">use_scattermoe</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
|
||
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a><span class="fu">experts_implementation</span><span class="kw">:</span><span class="at"> scattermoe</span></span>
|
||
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Expert LoRA targets (3D parameter tensors, not nn.Linear):</span></span>
|
||
<span id="cb5-8"><a href="#cb5-8" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_parameters</span><span class="kw">:</span></span>
|
||
<span id="cb5-9"><a href="#cb5-9" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> experts.gate_up_proj</span></span>
|
||
<span id="cb5-10"><a href="#cb5-10" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> experts.down_proj</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||
<p>Supported: Gemma4 (<code>gemma4_text</code>), Mixtral, Qwen MoE variants. The plugin auto-detects model type and routing function. Without ScatterMoE, expert LoRA still works but runs base expert matmul and LoRA as separate operations.</p>
|
||
</section>
|
||
</section>
|
||
<section id="gemma-4" class="level2">
|
||
<h2 class="anchored" data-anchor-id="gemma-4">Gemma 4</h2>
|
||
<p><strong>Models</strong>: <code>google/gemma-4-26B-A4B</code> (MoE), <code>google/gemma-4-31B</code> (dense), <code>google/gemma-4-E2B</code>, <code>google/gemma-4-E4B</code></p>
|
||
<p><strong>Architecture</strong>: Multimodal wrapper (<code>Gemma4ForConditionalGeneration</code>) over a text backbone (<code>Gemma4TextModel</code>), with optional vision/audio encoders. All Gemma4 HF repos have <code>model_type: "gemma4"</code> — even text-only variants load as multimodal with a vision tower.</p>
|
||
<section id="required-settings" class="level3">
|
||
<h3 class="anchored" data-anchor-id="required-settings">Required settings</h3>
|
||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb6"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Always needed for Gemma4:</span></span>
|
||
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="fu">freeze_mm_modules</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span><span class="co"> # Freeze vision/audio encoders for text-only training</span></span>
|
||
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_checkpointing_kwargs</span><span class="kw">:</span></span>
|
||
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_reentrant</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span><span class="co"> # Shared per-layer norms cause "marked ready twice" with reentrant</span></span>
|
||
<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a><span class="co"># LoRA target — restrict to language model only (DO NOT use lora_target_linear: true):</span></span>
|
||
<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_modules</span><span class="kw">:</span><span class="at"> </span><span class="st">'model.language_model.layers.[\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||
</section>
|
||
<section id="auto-detection" class="level3">
|
||
<h3 class="anchored" data-anchor-id="auto-detection">Auto-detection</h3>
|
||
<p>Axolotl auto-detects Gemma4 and applies:</p>
<ul>
<li><code>use_reentrant: false</code> for gradient checkpointing</li>
<li><code>ddp_find_unused_parameters: true</code> for DDP (skipped when <code>activation_offloading: true</code>)</li>
</ul>
|
||
</section>
|
||
<section id="multi-gpu" class="level3">
|
||
<h3 class="anchored" data-anchor-id="multi-gpu">Multi-GPU</h3>
|
||
<table class="caption-top table">
|
||
<colgroup>
|
||
<col style="width: 40%">
|
||
<col style="width: 32%">
|
||
<col style="width: 28%">
|
||
</colgroup>
|
||
<thead>
|
||
<tr class="header">
|
||
<th>Strategy</th>
|
||
<th>Works?</th>
|
||
<th>Notes</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td>DDP</td>
|
||
<td>Yes</td>
|
||
<td>Auto-sets <code>ddp_find_unused_parameters=True</code></td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>DDP + activation_offloading</td>
|
||
<td>Yes</td>
|
||
<td><code>find_unused_parameters</code> is skipped (conflicts with checkpoint wrappers)</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>FSDP1</td>
|
||
<td>No</td>
|
||
<td>OOM during dequantization/sharding with QLoRA</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>FSDP2</td>
|
||
<td>Yes</td>
|
||
<td>Use <code>Gemma4TextDecoderLayer</code> (not <code>Gemma4DecoderLayer</code>) as wrap class</td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>FSDP2 + activation_offloading</td>
|
||
<td>Yes</td>
|
||
<td>Lowest VRAM (~26 GiB/GPU for 26B-A4B)</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p>FSDP2 config:</p>
|
||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb7"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp</span><span class="kw">:</span></span>
|
||
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> full_shard</span></span>
|
||
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> auto_wrap</span></span>
|
||
<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_config</span><span class="kw">:</span></span>
|
||
<span id="cb7-5"><a href="#cb7-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">fsdp_version</span><span class="kw">:</span><span class="at"> </span><span class="dv">2</span></span>
|
||
<span id="cb7-6"><a href="#cb7-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">fsdp_auto_wrap_policy</span><span class="kw">:</span><span class="at"> TRANSFORMER_BASED_WRAP</span></span>
|
||
<span id="cb7-7"><a href="#cb7-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">fsdp_transformer_layer_cls_to_wrap</span><span class="kw">:</span><span class="at"> Gemma4TextDecoderLayer</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||
</section>
|
||
<section id="moe-26b-a4b" class="level3">
|
||
<h3 class="anchored" data-anchor-id="moe-26b-a4b">MoE (26B-A4B)</h3>
|
||
<ul>
|
||
<li><p><code>enable_moe_block: true</code>, 256 experts, top-k routing</p></li>
|
||
<li><p>No separate <code>SparseMoeBlock</code> — MoE is embedded in each decoder layer</p></li>
|
||
<li><p>Expert LoRA targets 3D parameter tensors:</p>
|
||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb8"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_parameters</span><span class="kw">:</span></span>
|
||
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> experts.gate_up_proj</span></span>
|
||
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> experts.down_proj</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
|
||
<li><p>ScatterMoE kernel acceleration:</p>
|
||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb9"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plugins</span><span class="kw">:</span></span>
|
||
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> axolotl.integrations.kernels.KernelsPlugin</span></span>
|
||
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a><span class="fu">use_kernels</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
|
||
<span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a><span class="fu">use_scattermoe</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
|
||
<span id="cb9-5"><a href="#cb9-5" aria-hidden="true" tabindex="-1"></a><span class="fu">experts_implementation</span><span class="kw">:</span><span class="at"> scattermoe</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
|
||
</ul>
|
||
</section>
|
||
<section id="vlm-vision-training" class="level3">
|
||
<h3 class="anchored" data-anchor-id="vlm-vision-training">VLM (Vision) Training</h3>
|
||
<p>All Gemma4 models load as <code>Gemma4ForConditionalGeneration</code> with a vision tower. No custom <code>ProcessingStrategy</code> needed — the base class auto-detects the image token.</p>
|
||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb10"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model</span><span class="kw">:</span><span class="at"> google/gemma-4-E2B-it</span><span class="co"> # or E4B-it, 26B-A4B</span></span>
|
||
<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a><span class="fu">processor_type</span><span class="kw">:</span><span class="at"> AutoProcessor</span></span>
|
||
<span id="cb10-3"><a href="#cb10-3" aria-hidden="true" tabindex="-1"></a><span class="fu">freeze_mm_modules</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
|
||
<span id="cb10-4"><a href="#cb10-4" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template</span><span class="kw">:</span><span class="at"> gemma4</span></span>
|
||
<span id="cb10-5"><a href="#cb10-5" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb10-6"><a href="#cb10-6" aria-hidden="true" tabindex="-1"></a><span class="fu">skip_prepare_dataset</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
|
||
<span id="cb10-7"><a href="#cb10-7" aria-hidden="true" tabindex="-1"></a><span class="fu">remove_unused_columns</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
|
||
<span id="cb10-8"><a href="#cb10-8" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||
<p>A starting VLM loss of ~8-15 is typical. In most runs, loss converges below 1.0 within ~30-50 steps, though results may vary across configurations.</p>
|
||
<p>For the 26B-A4B MoE variant with ScatterMoE + expert LoRA + CCE, add:</p>
|
||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb11"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plugins</span><span class="kw">:</span></span>
|
||
<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin</span></span>
|
||
<span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> axolotl.integrations.kernels.KernelsPlugin</span></span>
|
||
<span id="cb11-4"><a href="#cb11-4" aria-hidden="true" tabindex="-1"></a><span class="fu">use_kernels</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
|
||
<span id="cb11-5"><a href="#cb11-5" aria-hidden="true" tabindex="-1"></a><span class="fu">use_scattermoe</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
|
||
<span id="cb11-6"><a href="#cb11-6" aria-hidden="true" tabindex="-1"></a><span class="fu">experts_implementation</span><span class="kw">:</span><span class="at"> scattermoe</span></span>
|
||
<span id="cb11-7"><a href="#cb11-7" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_parameters</span><span class="kw">:</span></span>
|
||
<span id="cb11-8"><a href="#cb11-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> experts.gate_up_proj</span></span>
|
||
<span id="cb11-9"><a href="#cb11-9" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> experts.down_proj</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||
</section>
|
||
<section id="common-issues" class="level3">
|
||
<h3 class="anchored" data-anchor-id="common-issues">Common issues</h3>
|
||
<table class="caption-top table">
|
||
<colgroup>
|
||
<col style="width: 42%">
|
||
<col style="width: 33%">
|
||
<col style="width: 23%">
|
||
</colgroup>
|
||
<thead>
|
||
<tr class="header">
|
||
<th>Symptom</th>
|
||
<th>Cause</th>
|
||
<th>Fix</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr class="odd">
|
||
<td><code>mm_token_type_ids is required</code> in DDP</td>
|
||
<td><code>model.config</code> not accessible through DDP wrapper</td>
|
||
<td>Already fixed — <code>unwrap_model()</code> in <code>compute_loss</code> and <code>prediction_step</code></td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td><code>marked a variable ready twice</code> in DDP</td>
|
||
<td><code>ddp_find_unused_parameters=True</code> + activation_offloading checkpoint wrappers</td>
|
||
<td>Auto-handled — <code>find_unused_parameters</code> is skipped when <code>activation_offloading: true</code></td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td>Loss ~12 instead of ~0.5</td>
|
||
<td>Using <code>lora_target_linear: true</code> (applies LoRA to vision/audio modules)</td>
|
||
<td>Use the regex <code>lora_target_modules</code> pattern instead</td>
|
||
</tr>
|
||
<tr class="even">
|
||
<td>FSDP2 <code>Could not find Gemma4AudioLayer</code></td>
|
||
<td>Auto-wrap detects <code>_no_split_modules</code> including audio layers that don’t exist</td>
|
||
<td>Explicitly set <code>fsdp_transformer_layer_cls_to_wrap: Gemma4TextDecoderLayer</code></td>
|
||
</tr>
|
||
<tr class="odd">
|
||
<td><code>Gemma4ClippableLinear not supported</code> by PEFT</td>
|
||
<td>Vision tower uses a non-standard linear wrapper</td>
|
||
<td>Axolotl patches this automatically via <code>_patch_peft_clippable_linear()</code></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</section>
|
||
<section id="e2be4b-dense-models" class="level3">
|
||
<h3 class="anchored" data-anchor-id="e2be4b-dense-models">E2B/E4B dense models</h3>
|
||
<p>These have <code>hidden_size_per_layer_input: 256</code> (per-layer input embeddings) and <code>attention_k_eq_v: False</code>. Known issue: loss starts higher than expected (~12 vs ~0.5 for 26B). Root cause under investigation — may be related to the per-layer input mechanism or the <code>Gemma4ForConditionalGeneration</code> loss computation.</p>
|
||
</section>
|
||
</section>
|
||
<section id="gemma-3" class="level2">
|
||
<h2 class="anchored" data-anchor-id="gemma-3">Gemma 3</h2>
|
||
<p><strong>Models</strong>: <code>google/gemma-3-*</code></p>
|
||
<ul>
|
||
<li><code>ddp_find_unused_parameters: true</code> needed (multimodal unused params)</li>
|
||
<li><code>use_reentrant: false</code> recommended</li>
|
||
<li>Attention mask must be dropped for sample packing (handled automatically)</li>
|
||
<li>Multi-GPU test currently skipped (<code>tests/e2e/multigpu/test_gemma3.py</code>)</li>
|
||
</ul>
|
||
</section>
|
||
<section id="qwen-3.5-moe" class="level2">
|
||
<h2 class="anchored" data-anchor-id="qwen-3.5-moe">Qwen 3.5 MoE</h2>
|
||
<p><strong>Models</strong>: <code>Qwen/Qwen3.5-35B-A3B</code></p>
|
||
<ul>
|
||
<li><p>Hybrid architecture: DeltaNet linear attention (30 layers) + full attention (10 layers)</p></li>
|
||
<li><p>256 experts, 8 active per token</p></li>
|
||
<li><p>Known weight scale drift in late DeltaNet layers (36-38) due to AdamW + rare expert interaction</p></li>
|
||
<li><p>Fix: <code>normalize_weight_scales</code> config to detect and rescale outliers:</p>
|
||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb12"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="fu">normalize_weight_scales</span><span class="kw">:</span></span>
|
||
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">name_pattern</span><span class="kw">:</span><span class="at"> </span><span class="st">'linear_attn\.conv1d\.weight'</span></span>
|
||
<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">threshold</span><span class="kw">:</span><span class="at"> </span><span class="fl">1.3</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
|
||
</ul>
|
||
</section>
|
||
<section id="general-moe-notes" class="level2">
|
||
<h2 class="anchored" data-anchor-id="general-moe-notes">General MoE Notes</h2>
|
||
<ul>
|
||
<li><code>lora_target_linear: true</code> with multimodal MoE models will apply LoRA to ALL linear modules including vision/audio encoders — use regex <code>lora_target_modules</code> to restrict to language model only</li>
|
||
<li>Rare experts get larger effective learning rate from AdamW (small second-moment estimates) — can cause weight drift in recurrent/SSM components. Use <code>normalize_weight_scales</code> with <code>dry_run: true</code> to detect.</li>
|
||
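<li>For ScatterMoE kernel support, add the KernelsPlugin (<code>axolotl.integrations.kernels.KernelsPlugin</code>) and set <code>use_kernels: true</code>, <code>use_scattermoe: true</code>, and <code>experts_implementation: scattermoe</code></li>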
|
||
</ul>
|
||
|
||
|
||
</section>
|
||
</section>
|
||
|
||
</main> <!-- /main -->
|
||
<script id="quarto-html-after-body" type="application/javascript">
|
||
window.document.addEventListener("DOMContentLoaded", function (event) {
|
||
const icon = "";
|
||
const anchorJS = new window.AnchorJS();
|
||
anchorJS.options = {
|
||
placement: 'right',
|
||
icon: icon
|
||
};
|
||
anchorJS.add('.anchored');
|
||
/**
 * Report whether an element carries a code-annotation class.
 *
 * @param {{classList: Iterable<string>}} el - Element (or element-like object) to inspect.
 * @returns {boolean} true when at least one class name starts with "code-annotation-".
 */
const isCodeAnnotation = (el) => {
  // Array.from accepts any iterable class list, so this works for a
  // DOMTokenList as well as for a plain array.
  return Array.from(el.classList).some((className) =>
    className.startsWith('code-annotation-')
  );
};
|
||
/**
 * Clipboard "success" handler: briefly flashes a "Copied!" confirmation on the
 * copy button, then restores the button to its previous state.
 *
 * Steps: blur the button, add the `code-copy-button-checked` class, swap the
 * `title` to "Copied!", show a manual Bootstrap tooltip when `window.bootstrap`
 * is available, undo all of that after 1000 ms, and clear the text selection.
 *
 * @param {{trigger: Element, clearSelection: Function}} e - Success event;
 *   `trigger` is the button that was clicked.
 */
const onCopySuccess = function(e) {
  const copiedLabel = "Copied!";
  // Holds the pre-flash title while a flash is in progress.
  const savedTitleAttr = "data-code-copy-title";
  const button = e.trigger;

  // don't keep focus
  button.blur();

  // flash "checked"
  button.classList.add('code-copy-button-checked');

  // Remember the title to restore only on the FIRST click of a flash. A second
  // click inside the 1 s window would otherwise read the temporary "Copied!"
  // (or whatever the title attribute holds by then) and restore that instead,
  // leaving the button permanently mislabelled.
  let restoreTitle = button.getAttribute(savedTitleAttr);
  if (restoreTitle === null || restoreTitle === undefined) {
    restoreTitle = button.getAttribute("title") || "";
    button.setAttribute(savedTitleAttr, restoreTitle);
  }
  button.setAttribute("title", copiedLabel);

  let tooltip;
  if (window.bootstrap) {
    button.setAttribute("data-bs-toggle", "tooltip");
    button.setAttribute("data-bs-placement", "left");
    button.setAttribute("data-bs-title", copiedLabel);
    tooltip = new window.bootstrap.Tooltip(button, {
      trigger: "manual",
      customClass: "code-copy-button-tooltip",
      offset: [0, -8]
    });
    tooltip.show();
  }

  setTimeout(function() {
    if (tooltip) {
      tooltip.hide();
      button.removeAttribute("data-bs-title");
      button.removeAttribute("data-bs-toggle");
      button.removeAttribute("data-bs-placement");
    }
    button.setAttribute("title", restoreTitle);
    button.removeAttribute(savedTitleAttr);
    button.classList.remove('code-copy-button-checked');
  }, 1000);

  // clear code selection
  e.clearSelection();
};
|
||
/**
 * Build the text that the copy button places on the clipboard.
 *
 * Works on a deep clone of the button's parent element so the visible page is
 * never modified: every direct child of the cloned `<code>` element that
 * `isCodeAnnotation` flags is removed, and the remaining text is returned.
 *
 * @param {Element} trigger - The copy button that was clicked.
 * @returns {string} Code text without annotation markers; "" when the
 *   button's parent contains no `<code>` element.
 */
const getTextToCopy = function(trigger) {
  const scaffoldCopy = trigger.parentElement.cloneNode(true);
  const codeEl = scaffoldCopy.querySelector('code');
  if (!codeEl) {
    return "";
  }
  // `children` is a live collection: removing a child while iterating it
  // shifts the remaining items and skips the sibling right after each removed
  // annotation. Iterate over a snapshot instead.
  for (const childEl of Array.from(codeEl.children)) {
    if (isCodeAnnotation(childEl)) {
      childEl.remove();
    }
  }
  return codeEl.innerText;
};
|
||
// Copy buttons outside the embedded-source modal: the clipboard text comes
// from getTextToCopy and successful copies are reported via onCopySuccess.
const clipboard = new window.ClipboardJS(
  '.code-copy-button:not([data-in-quarto-modal])',
  { text: getTextToCopy }
);
clipboard.on('success', onCopySuccess);

// Copy buttons inside the embedded-source modal get a separate ClipboardJS
// instance that uses the modal element as its `container`. Skipped when the
// page has no such modal.
const quartoEmbeddedSourceModalEl =
  window.document.getElementById('quarto-embedded-source-code-modal');
if (quartoEmbeddedSourceModalEl) {
  const clipboardModal = new window.ClipboardJS(
    '.code-copy-button[data-in-quarto-modal]',
    { text: getTextToCopy, container: quartoEmbeddedSourceModalEl }
  );
  clipboardModal.on('success', onCopySuccess);
}
|
||
// Patterns for hrefs that count as internal: http(s)://localhost with an
// optional port, mailto: links, and URLs containing the site address.
var localhostRegex = /^(?:http|https):\/\/localhost\:?[0-9]*\//;
var mailtoRegex = /^mailto:/;
// Written as a regex literal on purpose: when the pattern is passed to
// RegExp as a string literal, "\." collapses to "." before the regex is
// compiled, so the dots would match any character instead of a literal dot.
var filterRegex = /https:\/\/docs\.axolotl\.ai/;
// Returns true when `href` matches any of the patterns above.
var isInternal = (href) => {
  return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
};
|
||
// Inspect non-navigation links: for each one that isInternal() does not
// accept, put back the href stored in its data-original-href attribute
// (when that attribute is present).
var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
for (const link of links) {
  if (isInternal(link.href)) {
    continue;
  }
  // undo the damage that might have been done by quarto-nav.js in the case of
  // links that we want to consider external
  const savedHref = link.dataset.originalHref;
  if (savedHref !== undefined) {
    link.href = savedHref;
  }
}
|
||
// Attaches a tippy tooltip to `el` using the shared 'quarto' theme settings.
//   contentFn     - optional, passed to tippy as `content`
//   onTriggerFn   - optional, passed to tippy as `onTrigger`
//   onUntriggerFn - optional, passed to tippy as `onUntrigger`
// An option is only set on the config when its argument is truthy.
function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
  const config = {
    allowHTML: true,
    maxWidth: 500,
    delay: 100,
    arrow: false,
    // mount the tooltip inside the parent of the element it is attached to
    appendTo: (reference) => reference.parentElement,
    interactive: true,
    interactiveBorder: 10,
    theme: 'quarto',
    placement: 'bottom-start',
  };
  const optionalHooks = [
    ['content', contentFn],
    ['onTrigger', onTriggerFn],
    ['onUntrigger', onUntriggerFn],
  ];
  for (const [option, hook] of optionalHooks) {
    if (hook) {
      config[option] = hook;
    }
  }
  window.tippy(el, config);
}
|
||
// Footnote references: hovering one shows the innerHTML of the element it
// points at, or an empty tooltip when that element is not on the page.
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (const ref of noterefs) {
  tippyHover(ref, () => {
    // use id or data attribute instead here
    let target = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
    // Absolute URLs are reduced to their hash; anything URL() rejects
    // (e.g. a bare "#id") is kept unchanged.
    try { target = new URL(target).hash; } catch {}
    const note = window.document.getElementById(target.replace(/^#\/?/, ""));
    return note ? note.innerHTML : "";
  });
}
// Cross-reference links on the page.
const xrefs = window.document.querySelectorAll('a.quarto-xref');
|
||
// Turns the element a cross reference points at (`note`) into the HTML shown
// in its tooltip. `note` is modified in place.
//   id   - target id, or null when the reference has no id
//   note - the target element
// Section targets (id null or starting with 'sec-') with more than two
// children are reduced to their first child plus the first following child
// that is not an empty paragraph; other section targets return innerHTML.
// Non-section targets lose their first AnchorJS link and return outerHTML
// for callouts, innerHTML otherwise.
const processXRef = (id, note) => {
  // Strip column container classes from an element and all its descendants
  const stripColumnClz = (el) => {
    el.classList.remove("page-full", "page-columns");
    if (el.children) {
      for (const child of el.children) {
        stripColumnClz(child);
      }
    }
  };
  // Typeset math in `target` when Quarto exposes a typesetMath hook
  const typeset = (target) => {
    if (window.Quarto?.typesetMath) {
      window.Quarto.typesetMath(target);
    }
  };

  stripColumnClz(note);

  const isSection = id === null || id.startsWith('sec-');
  if (!isSection) {
    // Remove any anchor links if they are present
    const anchorLink = note.querySelector('a.anchorjs-link');
    if (anchorLink) {
      anchorLink.remove();
    }
    typeset(note);
    return note.classList.contains("callout") ? note.outerHTML : note.innerHTML;
  }

  // Special case sections, only their first couple elements
  const container = document.createElement("div");
  const hasManyChildren = note.children && note.children.length > 2;
  if (!hasManyChildren) {
    typeset(note);
    return note.innerHTML;
  }

  container.appendChild(note.children[0].cloneNode(true));
  const firstBody = Array.from(note.children)
    .slice(1)
    .find((child) => !(child.tagName === "P" && child.innerText === ""));
  if (firstBody) {
    container.appendChild(firstBody.cloneNode(true));
  }
  typeset(container);
  return container.innerHTML;
};
|
||
// Cross-reference tooltips. On trigger the tooltip is disabled while its
// content is resolved, then enabled and shown. Content comes from:
//   - the element on this page with the referenced id, or
//   - that element in the fetched target page, or
//   - for links without a hash, the fetched page's 'main.content'.
for (const xref of xrefs) {
  tippyHover(xref, undefined, (instance) => {
    instance.disable();

    // Re-enable and display the tooltip once resolution has finished
    const reveal = () => {
      instance.enable();
      instance.show();
    };
    // Fetch `target` and parse the response body as an HTML document
    const fetchDocument = (target) =>
      fetch(target)
        .then((res) => res.text())
        .then((markup) => new DOMParser().parseFromString(markup, "text/html"));

    const url = xref.getAttribute('href');
    let hash;
    if (url.startsWith('#')) {
      hash = url;
    } else {
      // hrefs that URL() cannot parse leave `hash` undefined
      try { hash = new URL(url).hash; } catch {}
    }

    if (!hash) {
      // See if we can fetch a full url (with no hash to target)
      // This is a special case and we should probably do some content thinning / targeting
      fetchDocument(url)
        .then((htmlDoc) => {
          const main = htmlDoc.querySelector('main.content');
          if (main === null) {
            return;
          }
          // This should only happen for chapter cross references
          // (since there is no id in the URL)
          // remove the first header
          if (main.children.length > 0 && main.children[0].tagName === "HEADER") {
            main.children[0].remove();
          }
          instance.setContent(processXRef(null, main));
        })
        .finally(reveal);
      return;
    }

    const id = hash.replace(/^#\/?/, "");
    const localNote = window.document.getElementById(id);
    if (localNote !== null) {
      // Target is on this page: process a clone so the page is untouched
      try {
        instance.setContent(processXRef(id, localNote.cloneNode(true)));
      } finally {
        reveal();
      }
      return;
    }

    // See if we can fetch this
    fetchDocument(url.split('#')[0])
      .then((htmlDoc) => {
        const remoteNote = htmlDoc.getElementById(id);
        if (remoteNote !== null) {
          instance.setContent(processXRef(id, remoteNote));
        }
      })
      .finally(reveal);
  }, (instance) => {
  });
}
|
||
// Annotation element whose code lines are currently highlighted
// (undefined when nothing is selected).
let selectedAnnoteEl;
// Builds the CSS selector for the <span> that carries the given
// data-code-cell / data-code-annotation attribute pair.
const selectorForAnnotation = (cell, annotation) =>
  `span[data-code-cell="${cell}"][data-code-annotation="${annotation}"]`;
|
||
// Highlights the code lines that belong to the annotation `annoteEl` and
// records it in `selectedAnnoteEl`.
// The line ids are built from the element's data-target-cell value and the
// comma separated data-code-lines attribute of the matching annotation span.
// Two absolutely positioned overlay <div>s (created on first use, reused
// afterwards) are sized to span from the first to the last of those lines:
// one in the code container, one in the cell's '.code-annotation-gutter'.
const selectCodeLines = (annoteEl) => {
  const doc = window.document;
  const targetCell = annoteEl.getAttribute("data-target-cell");
  const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
  const annoteSpan = doc.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
  const lineIds = annoteSpan
    .getAttribute("data-code-lines")
    .split(",")
    .map((line) => targetCell + "-" + line);
  if (lineIds.length === 0) {
    return;
  }

  //compute the position of the single el (top and bottom and make a div)
  const firstLine = doc.getElementById(lineIds[0]);
  const top = firstLine.offsetTop;
  let height = firstLine.offsetHeight;
  const parent = firstLine.parentElement.parentElement;
  if (lineIds.length > 1) {
    // several lines: stretch from the top of the first to the bottom of the last
    const lastLine = doc.getElementById(lineIds[lineIds.length - 1]);
    height = lastLine.offsetTop + lastLine.offsetHeight - top;
  }

  if (parent !== null) {
    // Look up the overlay with this id; when missing, create it and hand it
    // to `attach` for insertion into the document.
    const overlay = (id, attach) => {
      let box = doc.getElementById(id);
      if (box === null) {
        box = doc.createElement("div");
        box.setAttribute("id", id);
        box.style.position = 'absolute';
        attach(box);
      }
      return box;
    };

    // cook up a div (if necessary) and position it
    const div = overlay("code-annotation-line-highlight", (box) => {
      parent.appendChild(box);
    });
    div.style.top = top - 2 + "px";
    div.style.height = height + 4 + "px";
    div.style.left = 0;

    const gutterDiv = overlay("code-annotation-line-highlight-gutter", (box) => {
      const codeCell = doc.getElementById(targetCell);
      codeCell.querySelector('.code-annotation-gutter').appendChild(box);
    });
    gutterDiv.style.top = top - 2 + "px";
    gutterDiv.style.height = height + 4 + "px";
  }
  selectedAnnoteEl = annoteEl;
};
|
||
// Removes both highlight overlays (code area and gutter) from the page, if
// present, and clears the current selection.
const unselectCodeLines = () => {
  const overlayIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"];
  for (const elId of overlayIds) {
    const overlay = window.document.getElementById(elId);
    if (overlay) {
      overlay.remove();
    }
  }
  selectedAnnoteEl = undefined;
};
|
||
// Handle positioning of the toggle: on window resize, recompute the
// highlight overlay for the currently selected annotation (if any).
// The handler is wrapped in throttle() with a 10 ms delay.
window.addEventListener("resize", throttle(() => {
  // NOTE(review): `elRect` is assigned here but is neither declared nor read
  // anywhere in this section - confirm it is declared elsewhere.
  elRect = undefined;
  if (!selectedAnnoteEl) {
    return;
  }
  selectCodeLines(selectedAnnoteEl);
}, 10));
|
||
// Wraps `fn` so that bursts of calls are rate limited.
// The first call runs `fn` immediately. While engaged, each further call
// cancels the pending timer and schedules `fn` (with that call's arguments)
// to run after `ms` milliseconds; once that delayed call has run, the wrapper
// is released and the next call goes through immediately again.
function throttle(fn, ms) {
  let engaged = false;
  let timer;
  return (...args) => {
    if (engaged) {
      // all the others get throttled
      if (timer) {
        clearTimeout(timer); // cancel #2
      }
      timer = setTimeout(() => {
        fn.apply(this, args);
        engaged = false;
        timer = false;
      }, ms);
      return;
    }
    // first call gets through
    fn.apply(this, args);
    engaged = true;
  };
}
|
||
// Attach click handler to the DT: clicking an annotation term highlights its
// code lines and marks it 'code-annotation-active'; clicking the currently
// selected one again removes the highlight.
const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
annoteDls.forEach((annoteDlNode) => {
  annoteDlNode.addEventListener('click', (event) => {
    const clickedEl = event.target;
    if (clickedEl === selectedAnnoteEl) {
      // Unselect the line
      unselectCodeLines();
      clickedEl.classList.remove('code-annotation-active');
      return;
    }
    // Switch selection: clear the old highlight and active marker first
    unselectCodeLines();
    const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
    if (activeEl) {
      activeEl.classList.remove('code-annotation-active');
    }
    selectCodeLines(clickedEl);
    clickedEl.classList.add('code-annotation-active');
  });
});
|
||
// Walks up from `el` until it reaches a node whose parent has a non-empty
// data-cites value. Returns { el, cites } where `el` is that node (the child
// of the element carrying data-cites) and `cites` is the value split on
// spaces; returns undefined when no ancestor carries data-cites.
const findCites = (el) => {
  let node = el;
  while (node.parentElement) {
    const cites = node.parentElement.dataset.cites;
    if (cites) {
      return { el: node, cites: cites.split(' ') };
    }
    node = node.parentElement;
  }
  return undefined;
};
|
||
// Bibliography references: hovering a citation shows one entry per cited
// key, each filled from the element with id 'ref-<key>' when it exists.
var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
for (const ref of bibliorefs) {
  const citeInfo = findCites(ref);
  if (!citeInfo) {
    continue;
  }
  tippyHover(citeInfo.el, () => {
    const popup = window.document.createElement('div');
    for (const cite of citeInfo.cites) {
      const citeDiv = window.document.createElement('div');
      citeDiv.classList.add('hanging-indent', 'csl-entry');
      const biblioDiv = window.document.getElementById('ref-' + cite);
      if (biblioDiv) {
        citeDiv.innerHTML = biblioDiv.innerHTML;
      }
      popup.appendChild(citeDiv);
    }
    return popup.innerHTML;
  });
}
|
||
});
|
||
</script>
|
||
</div> <!-- /content -->
|
||
|
||
|
||
|
||
|
||
</body></html> |