Files
axolotl/docs/agents/new_model_support.html
Quarto GHA Workflow Runner c7ad3c8e22 Built site for gh-pages
2026-04-22 13:12:17 +00:00

1570 lines
73 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
<meta charset="utf-8">
<meta name="generator" content="quarto-1.9.37">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
<title>new_model_support – Axolotl</title>
<!--
  Inline stylesheet; its leading CSS comment attributes it to pandoc's defaults.
  Part 1: generic helpers (wrapping inline code, small caps, flex columns,
  hanging indents, task-list checkboxes).
  Part 2: rules for highlighted code blocks (.sourceCode): overflow is auto on
  screen, lines wrap with a hanging indent in print, and pre.numberSource
  renders line numbers through a CSS counter named source-line.
  NOTE(review): the empty "div.sourceCode { }" rule near the end has no
  declarations; it looks like a generator placeholder, confirm before removing.
-->
<style>
/* Default styles provided by pandoc.
** See https://pandoc.org/MANUAL.html#variables-for-html for config info.
*/
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.columns{display: flex; gap: min(4vw, 1.5em);}
div.column{flex: auto; overflow-x: auto;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
ul.task-list li input[type="checkbox"] {
width: 0.8em;
margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
vertical-align: middle;
}
/* CSS for syntax highlighting */
html { -webkit-text-size-adjust: 100%; }
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
}
pre.numberSource { margin-left: 3em; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
</style>
<script src="../../site_libs/quarto-nav/quarto-nav.js"></script>
<script src="../../site_libs/clipboard/clipboard.min.js"></script>
<script src="../../site_libs/quarto-search/autocomplete.umd.js"></script>
<script src="../../site_libs/quarto-search/fuse.min.js"></script>
<script src="../../site_libs/quarto-search/quarto-search.js"></script>
<meta name="quarto:offset" content="../../">
<link href="../../favicon.jpg" rel="icon" type="image/jpeg">
<script src="../../site_libs/quarto-html/quarto.js" type="module"></script>
<script src="../../site_libs/quarto-html/tabsets/tabsets.js" type="module"></script>
<script src="../../site_libs/quarto-html/popper.min.js"></script>
<script src="../../site_libs/quarto-html/tippy.umd.min.js"></script>
<script src="../../site_libs/quarto-html/anchor.min.js"></script>
<link href="../../site_libs/quarto-html/tippy.css" rel="stylesheet">
<link href="../../site_libs/quarto-html/quarto-syntax-highlighting-dark-d0ae9245876894da5ac7e18953ecc5cc.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="../../site_libs/bootstrap/bootstrap.min.js"></script>
<link href="../../site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="../../site_libs/bootstrap/bootstrap-b7aea7e464dd78f23decae44cf02da44.min.css" rel="stylesheet" append-hash="true" id="quarto-bootstrap" data-mode="dark">
<!--
  Search settings embedded as a JSON data block (type application/json, so it
  is data rather than executable script).
  Presumably read by quarto-search.js through the quarto-search-options id;
  confirm against that script.
  Keys visible here: "location" navbar, "type" overlay, "limit" 50,
  "keyboard-shortcut" f, / and s, and the UI strings under "language"
  ("search-text-placeholder" is intentionally or accidentally empty; verify).
-->
<script id="quarto-search-options" type="application/json">{
"location": "navbar",
"copy-button": false,
"collapse-after": 3,
"panel-placement": "end",
"type": "overlay",
"limit": 50,
"keyboard-shortcut": [
"f",
"/",
"s"
],
"show-item-context": false,
"language": {
"search-no-results-text": "No results",
"search-matching-documents-text": "matching documents",
"search-copy-link-title": "Copy link to search",
"search-hide-matches-text": "Hide additional matches",
"search-more-match-text": "more match in this document",
"search-more-matches-text": "more matches in this document",
"search-clear-button-title": "Clear",
"search-text-placeholder": "",
"search-detached-cancel-button-title": "Cancel",
"search-submit-button-title": "Submit",
"search-label": "Search"
}
}</script>
<script async="" src="https://www.googletagmanager.com/gtag/js?id=G-9KYCVJBNMQ"></script>
<!-- Google tag bootstrap for measurement id G-9KYCVJBNMQ; the config call passes anonymize_ip: true. -->
<script>
// Reuse an existing queue if one is already present on window; otherwise start an empty one.
window.dataLayer = window.dataLayer || [];
// Each gtag(...) call is queued by pushing the raw `arguments` object.
// NOTE(review): this matches the vendor's published snippet; do not convert it to a rest-parameter array without checking the gtag.js docs.
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
</script>
<link rel="stylesheet" href="../../styles.css">
</head>
<body class="nav-sidebar docked nav-fixed quarto-light">
<div id="quarto-search-results"></div>
<header id="quarto-header" class="headroom fixed-top">
<nav class="navbar navbar-expand " data-bs-theme="dark">
<div class="navbar-container container-fluid">
<div class="navbar-brand-container mx-auto">
<!-- Brand link to the site index. Both logo images stay decorative (empty alt), so the anchor carries the accessible name; this gives exactly one name regardless of which theme variant is displayed. -->
<a href="../../index.html" class="navbar-brand navbar-brand-logo" aria-label="Axolotl home">
<img src="../../image/axolotl_logo_digital_white.svg" alt="" class="navbar-logo light-content">
<img src="../../image/axolotl_logo_digital_white.svg" alt="" class="navbar-logo dark-content">
</a>
</div>
<!-- Icon-only external links. Each anchor gets its accessible name from aria-label (previously empty), and the icon glyph is hidden from assistive technology. -->
<div class="quarto-navbar-tools tools-wide tools-end">
<a href="https://twitter.com/axolotl_ai" class="quarto-navigation-tool px-1" aria-label="Twitter"><i class="bi bi-twitter" aria-hidden="true"></i></a>
<a href="https://github.com/axolotl-ai-cloud/axolotl/" class="quarto-navigation-tool px-1" aria-label="GitHub"><i class="bi bi-github" aria-hidden="true"></i></a>
<a href="https://discord.gg/7m9sfhzaf3" class="quarto-navigation-tool px-1" aria-label="Discord"><i class="bi bi-discord" aria-hidden="true"></i></a>
</div>
<div id="quarto-search" class="" title="Search"></div>
</div> <!-- /container-fluid -->
</nav>
<nav class="quarto-secondary-nav">
<div class="container-fluid d-flex">
<!-- Sidebar toggle: collapse trigger targeting .quarto-sidebar-collapse-item. The inline handler calls window.quartoToggleHeadroom when that function is defined. A native button needs no explicit role, so the redundant role attribute is omitted. -->
<button type="button" class="quarto-btn-toggle btn" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
<i class="bi bi-layout-text-sidebar-reverse" aria-hidden="true"></i>
</button>
<nav class="quarto-page-breadcrumbs" aria-label="breadcrumb"><ol class="breadcrumb"></ol></nav>
<a class="flex-grow-1" role="navigation" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
</a>
</div>
</nav>
</header>
<!-- content -->
<div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article page-navbar">
<!-- sidebar -->
<nav id="quarto-sidebar" class="sidebar collapse collapse-horizontal quarto-sidebar-collapse-item sidebar-navigation docked overflow-auto">
<div class="sidebar-menu-container">
<ul class="list-unstyled mt-1">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../index.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Home</span></a>
</div>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true">
<span class="menu-text">Getting Started</span></a>
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-1" class="collapse list-unstyled sidebar-section depth1 show">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/getting-started.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Quickstart</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/choosing_method.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Which Fine-Tuning Method Should I Use?</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/installation.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Installation</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/inference.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Inference and Merging</span></a>
</div>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" role="navigation" aria-expanded="false">
<span class="menu-text">Model Guides</span></a>
<a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" role="navigation" aria-expanded="false" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-2" class="collapse list-unstyled sidebar-section depth2 ">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/kimi-linear.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Kimi Linear</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/plano.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Plano Orchestrator</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/mimo.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">MiMo</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/internvl3_5.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">InternVL 3.5</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/olmo3.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">OLMo 3</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/trinity.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Trinity</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/arcee.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Arcee AFM</span></a>
</div>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="false">
<span class="menu-text">Ministral3</span></a>
<a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="false" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-3" class="collapse list-unstyled sidebar-section depth3 ">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/ministral3.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Ministral3</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/ministral3/think.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Ministral 3 Thinking</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/ministral3/vision.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Ministral 3 Vision</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-4" role="navigation" aria-expanded="false">
<span class="menu-text">Magistral</span></a>
<a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-4" role="navigation" aria-expanded="false" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-4" class="collapse list-unstyled sidebar-section depth3 ">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/magistral.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Magistral</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/magistral/think.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Magistral Thinking</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/magistral/vision.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Magistral Vision</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/ministral.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Ministral</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/mistral-small.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Mistral Small 3.1/3.2</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/voxtral.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Voxtral</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/devstral.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Devstral</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/mistral.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Mistral 7B</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/llama-4.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Llama 4</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/llama-2.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Llama 2</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/qwen3-next.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Qwen 3 Next</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/qwen3.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Qwen 3</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/gemma3n.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Gemma 3n</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/apertus.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Apertus</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/gpt-oss.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">GPT-OSS</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/seed-oss.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Seed-OSS</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/phi.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Phi</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/smolvlm2.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">SmolVLM 2</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/granite4.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Granite 4</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/LiquidAI.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Liquid Foundation Models 2</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/hunyuan.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Hunyuan</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/jamba.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Jamba</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/models/orpheus.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Orpheus</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/cli.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Command Line Interface (CLI)</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/telemetry.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Telemetry</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/config-reference.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Config Reference</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/api" class="sidebar-item-text sidebar-link">
<span class="menu-text">API Reference</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a href="../../docs/dataset-formats/index.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Dataset Formats</span></a>
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-5" role="navigation" aria-expanded="true" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-5" class="collapse list-unstyled sidebar-section depth1 show">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/dataset-formats/pretraining.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Pre-training</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/dataset-formats/inst_tune.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Instruction Tuning</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/dataset-formats/conversation.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Conversation</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/dataset-formats/stepwise_supervised.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Stepwise Supervised Format</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/dataset-formats/template_free.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Template-Free</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/dataset-formats/tokenized.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Custom Pre-Tokenized Dataset</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-6" role="navigation" aria-expanded="true">
<span class="menu-text">Deployments</span></a>
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-6" role="navigation" aria-expanded="true" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-6" class="collapse list-unstyled sidebar-section depth1 show">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/docker.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Docker</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/multi-gpu.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Multi-GPU</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/multi-node.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Multi Node</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Ray Train</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">AMD GPUs on HPC Systems</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/mac.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Mac M-series</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-7" role="navigation" aria-expanded="true">
<span class="menu-text">How To Guides</span></a>
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-7" role="navigation" aria-expanded="true" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-7" class="collapse list-unstyled sidebar-section depth1 show">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/multimodal.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">MultiModal / Vision Language Models (BETA)</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/rlhf.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">RLHF (Beta)</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/grpo.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">GRPO Training</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/ebft.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">EBFT Training</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/vllm_serving.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">vLLM Serving for GRPO Training</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/reward_modelling.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Reward Modelling</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/lr_groups.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Learning Rate Groups</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/lora_optims.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">LoRA Optimizations</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/dataset_loading.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Dataset Loading</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/qat.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Quantization Aware Training (QAT)</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/quantize.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Quantization with torchao</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/optimizations.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Optimizations Guide</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-8" role="navigation" aria-expanded="true">
<span class="menu-text">Core Concepts</span></a>
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-8" role="navigation" aria-expanded="true" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-8" class="collapse list-unstyled sidebar-section depth1 show">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/batch_vs_grad.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Batch size vs Gradient accumulation</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/dataset_preprocessing.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Dataset Preprocessing</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/streaming.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Streaming Datasets</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/multipack.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Multipack (Sample Packing)</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/mixed_precision.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Mixed Precision Training</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/optimizers.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Optimizers</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/attention.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Attention</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-9" role="navigation" aria-expanded="true">
<span class="menu-text">Advanced Features</span></a>
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-9" role="navigation" aria-expanded="true" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-9" class="collapse list-unstyled sidebar-section depth1 show">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">FSDP + QLoRA</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/torchao.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">PyTorch ao</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/custom_integrations.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Custom Integrations</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/sequence_parallelism.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Sequence Parallelism</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/gradient_checkpointing.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Gradient Checkpointing, Activation Offloading, and Layer Offloading</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/nd_parallelism.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">N-D Parallelism (Beta)</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/expert_quantization.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">MoE Expert Quantization</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-10" role="navigation" aria-expanded="true">
<span class="menu-text">Troubleshooting</span></a>
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-10" role="navigation" aria-expanded="true" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-10" class="collapse list-unstyled sidebar-section depth1 show">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/faq.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">FAQ</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/training_stability.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Training Stability &amp; Debugging</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/debugging.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Debugging</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../../docs/nccl.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">NCCL</span></a>
</div>
</li>
</ul>
</li>
</ul>
</div>
</nav>
<div id="quarto-sidebar-glass" class="quarto-sidebar-collapse-item" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item"></div>
<!-- margin-sidebar -->
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
<nav id="TOC" role="doc-toc" class="toc-active">
<h2 id="toc-title">On this page</h2>
<ul>
<li><a href="#new-model-support-agent-reference" id="toc-new-model-support-agent-reference" class="nav-link active" data-scroll-target="#new-model-support-agent-reference">New Model Support — Agent Reference</a>
<ul class="collapse">
<li><a href="#quick-validation-checklist" id="toc-quick-validation-checklist" class="nav-link" data-scroll-target="#quick-validation-checklist">Quick Validation Checklist</a></li>
<li><a href="#loss-debugging" id="toc-loss-debugging" class="nav-link" data-scroll-target="#loss-debugging">Loss Debugging</a>
<ul class="collapse">
<li><a href="#expected-initial-loss" id="toc-expected-initial-loss" class="nav-link" data-scroll-target="#expected-initial-loss">Expected initial loss</a></li>
<li><a href="#direct-comparison-technique" id="toc-direct-comparison-technique" class="nav-link" data-scroll-target="#direct-comparison-technique">Direct comparison technique</a></li>
<li><a href="#model_accepts_loss_kwargs-inflation" id="toc-model_accepts_loss_kwargs-inflation" class="nav-link" data-scroll-target="#model_accepts_loss_kwargs-inflation"><code>model_accepts_loss_kwargs</code> inflation</a></li>
</ul></li>
<li><a href="#multimodal-models-forconditionalgeneration" id="toc-multimodal-models-forconditionalgeneration" class="nav-link" data-scroll-target="#multimodal-models-forconditionalgeneration">Multimodal Models (ForConditionalGeneration)</a>
<ul class="collapse">
<li><a href="#why-this-matters" id="toc-why-this-matters" class="nav-link" data-scroll-target="#why-this-matters">Why this matters</a></li>
<li><a href="#required-extra-inputs" id="toc-required-extra-inputs" class="nav-link" data-scroll-target="#required-extra-inputs">Required extra inputs</a></li>
<li><a href="#custom-layer-types-and-peft" id="toc-custom-layer-types-and-peft" class="nav-link" data-scroll-target="#custom-layer-types-and-peft">Custom layer types and PEFT</a></li>
</ul></li>
<li><a href="#sample-packing" id="toc-sample-packing" class="nav-link" data-scroll-target="#sample-packing">Sample Packing</a>
<ul class="collapse">
<li><a href="#how-packed-sequence-detection-works-transformers-5.x" id="toc-how-packed-sequence-detection-works-transformers-5.x" class="nav-link" data-scroll-target="#how-packed-sequence-detection-works-transformers-5.x">How packed sequence detection works (transformers ≥ 5.x)</a></li>
<li><a href="#fix-for-models-using-create_causal_mask_mapping" id="toc-fix-for-models-using-create_causal_mask_mapping" class="nav-link" data-scroll-target="#fix-for-models-using-create_causal_mask_mapping">Fix for models using <code>create_causal_mask_mapping</code></a></li>
<li><a href="#models-that-dont-need-this-fix" id="toc-models-that-dont-need-this-fix" class="nav-link" data-scroll-target="#models-that-dont-need-this-fix">Models that DON’T need this fix</a></li>
</ul></li>
<li><a href="#attention-backend-selection" id="toc-attention-backend-selection" class="nav-link" data-scroll-target="#attention-backend-selection">Attention Backend Selection</a></li>
<li><a href="#cut-cross-entropy-cce" id="toc-cut-cross-entropy-cce" class="nav-link" data-scroll-target="#cut-cross-entropy-cce">Cut Cross Entropy (CCE)</a>
<ul class="collapse">
<li><a href="#how-cce-patches-work" id="toc-how-cce-patches-work" class="nav-link" data-scroll-target="#how-cce-patches-work">How CCE patches work</a></li>
<li><a href="#adding-cce-for-a-new-model" id="toc-adding-cce-for-a-new-model" class="nav-link" data-scroll-target="#adding-cce-for-a-new-model">Adding CCE for a new model</a></li>
<li><a href="#common-cce-pitfall" id="toc-common-cce-pitfall" class="nav-link" data-scroll-target="#common-cce-pitfall">Common CCE pitfall</a></li>
</ul></li>
<li><a href="#moe-models" id="toc-moe-models" class="nav-link" data-scroll-target="#moe-models">MoE Models</a>
<ul class="collapse">
<li><a href="#dense-mlp-vs-moe-experts" id="toc-dense-mlp-vs-moe-experts" class="nav-link" data-scroll-target="#dense-mlp-vs-moe-experts">Dense MLP vs MoE experts</a></li>
<li><a href="#scattermoe-kernels" id="toc-scattermoe-kernels" class="nav-link" data-scroll-target="#scattermoe-kernels">ScatterMoE kernels</a></li>
</ul></li>
<li><a href="#where-to-add-model-specific-fixes" id="toc-where-to-add-model-specific-fixes" class="nav-link" data-scroll-target="#where-to-add-model-specific-fixes">Where to Add Model-Specific Fixes</a></li>
</ul></li>
</ul>
</nav>
</div>
<!-- main -->
<main class="content" id="quarto-document-content"><header id="title-block-header" class="quarto-title-block"></header>
<section id="new-model-support-agent-reference" class="level1">
<h1>New Model Support — Agent Reference</h1>
<p>Guide for debugging and adding support for new model architectures in axolotl. Based on lessons learned from Gemma4, Gemma3, Qwen2-VL, and other multimodal/MoE models.</p>
<section id="quick-validation-checklist" class="level2">
<h2 class="anchored" data-anchor-id="quick-validation-checklist">Quick Validation Checklist</h2>
<p>When testing a new model, run through these checks in order:</p>
<ol type="1">
<li><strong>Does the model load?</strong> <code>axolotl preprocess config.yaml</code> — catches config schema errors</li>
<li><strong>Does LoRA apply?</strong> Check for “Unsupported layer type” warnings from PEFT</li>
<li><strong>Is the initial loss sane?</strong> First-step loss for a pretrained model should be 0.5–2.0 for SFT</li>
<li><strong>Does sample packing work?</strong> Compare loss with <code>sample_packing: true</code> vs <code>false</code> — should be similar</li>
<li><strong>Is CCE active?</strong> Check for “Applying Cut Cross Entropy” log and verify peak VRAM is lower</li>
</ol>
</section>
<section id="loss-debugging" class="level2">
<h2 class="anchored" data-anchor-id="loss-debugging">Loss Debugging</h2>
<section id="expected-initial-loss" class="level3">
<h3 class="anchored" data-anchor-id="expected-initial-loss">Expected initial loss</h3>
<p>A pretrained model doing SFT should start with loss roughly in the 0.5–2.0 range. If loss starts above 3.0, something is wrong. If it’s near <code>log(vocab_size)</code> (≈ 12 for 262K vocab), the model is predicting at random — attention masking or model weights are broken.</p>
</section>
<section id="direct-comparison-technique" class="level3">
<h3 class="anchored" data-anchor-id="direct-comparison-technique">Direct comparison technique</h3>
<p>The fastest way to isolate a loss issue — bypass the trainer entirely:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Load model via axolotl's pipeline (applies all patches)</span></span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> axolotl.cli.config <span class="im">import</span> load_cfg</span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> axolotl.utils.config <span class="im">import</span> normalize_config, prepare_plugins</span>
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> axolotl.loaders.tokenizer <span class="im">import</span> load_tokenizer</span>
<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> axolotl.loaders.model <span class="im">import</span> ModelLoader</span>
<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a>cfg <span class="op">=</span> load_cfg(<span class="st">"your_config.yaml"</span>)</span>
<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a>normalize_config(cfg)</span>
<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a>prepare_plugins(cfg)</span>
<span id="cb1-10"><a href="#cb1-10" aria-hidden="true" tabindex="-1"></a>tokenizer <span class="op">=</span> load_tokenizer(cfg)</span>
<span id="cb1-11"><a href="#cb1-11" aria-hidden="true" tabindex="-1"></a>model, _ <span class="op">=</span> ModelLoader(cfg, tokenizer).load()</span>
<span id="cb1-12"><a href="#cb1-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb1-13"><a href="#cb1-13" aria-hidden="true" tabindex="-1"></a><span class="co"># Forward pass on preprocessed data</span></span>
<span id="cb1-14"><a href="#cb1-14" aria-hidden="true" tabindex="-1"></a>model.train()</span>
<span id="cb1-15"><a href="#cb1-15" aria-hidden="true" tabindex="-1"></a>out <span class="op">=</span> model(input_ids, labels<span class="op">=</span>labels)</span>
<span id="cb1-16"><a href="#cb1-16" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="ss">f"Direct loss: </span><span class="sc">{</span>out<span class="sc">.</span>loss<span class="sc">.</span>item()<span class="sc">}</span><span class="ss">"</span>) <span class="co"># Compare to trainer's reported loss</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>If direct loss is correct (~1.0) but trainer reports 3–4x higher, check <code>model_accepts_loss_kwargs</code> (see below).</p>
</section>
<section id="model_accepts_loss_kwargs-inflation" class="level3">
<h3 class="anchored" data-anchor-id="model_accepts_loss_kwargs-inflation"><code>model_accepts_loss_kwargs</code> inflation</h3>
<p>HF Trainer checks if the model’s <code>forward()</code> has <code>**kwargs</code> and sets <code>model_accepts_loss_kwargs=True</code>. This changes loss normalization: the trainer does NOT divide loss by <code>gradient_accumulation_steps</code> before logging. The gradient is correct — only the logged loss is inflated.</p>
<p><strong>Symptom</strong>: Logged loss ≈ actual_loss × gradient_accumulation_steps.</p>
<p><strong>Which models are affected</strong>: Any model with <code>**kwargs</code> in forward (common in multimodal models for extra inputs like <code>mm_token_type_ids</code>, <code>pixel_values</code>, etc.).</p>
<p><strong>Fix location</strong>: <code>src/axolotl/core/trainers/base.py</code> <code>__init__()</code> — after <code>super().__init__()</code>, check if the unwrapped model actually has <code>num_items_in_batch</code> in its forward signature. If not, set <code>self.model_accepts_loss_kwargs = False</code>.</p>
</section>
</section>
<section id="multimodal-models-forconditionalgeneration" class="level2">
<h2 class="anchored" data-anchor-id="multimodal-models-forconditionalgeneration">Multimodal Models (ForConditionalGeneration)</h2>
<p>Many recent models use <code>ForConditionalGeneration</code> as the top-level class, not <code>ForCausalLM</code>:</p>
<ul>
<li>Gemma3 → <code>Gemma3ForConditionalGeneration</code></li>
<li>Gemma4 → <code>Gemma4ForConditionalGeneration</code></li>
<li>Qwen2-VL → <code>Qwen2VLForConditionalGeneration</code></li>
<li>LLaVA → <code>LlavaForConditionalGeneration</code></li>
</ul>
<section id="why-this-matters" class="level3">
<h3 class="anchored" data-anchor-id="why-this-matters">Why this matters</h3>
<table class="caption-top table">
<thead>
<tr class="header">
<th>Component</th>
<th>Targets <code>ForCausalLM</code></th>
<th>Needs <code>ForConditionalGeneration</code></th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>CCE patches</td>
<td>✅ (default)</td>
<td>❌ silently inactive if not patched</td>
</tr>
<tr class="even">
<td>PEFT LoRA</td>
<td></td>
<td>May fail on custom layer types</td>
</tr>
<tr class="odd">
<td>HF Trainer label handling</td>
<td></td>
<td>May need extra inputs</td>
</tr>
</tbody>
</table>
</section>
<section id="required-extra-inputs" class="level3">
<h3 class="anchored" data-anchor-id="required-extra-inputs">Required extra inputs</h3>
<p>Multimodal models require special inputs during training even for text-only data:</p>
<table class="caption-top table">
<thead>
<tr class="header">
<th>Model</th>
<th>Required Input</th>
<th>Value for Text-Only</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Gemma4</td>
<td><code>mm_token_type_ids</code></td>
<td><code>torch.zeros_like(input_ids)</code></td>
</tr>
<tr class="even">
<td>Gemma3</td>
<td><code>token_type_ids</code></td>
<td><code>torch.zeros_like(input_ids)</code></td>
</tr>
</tbody>
</table>
<p>Auto-inject in <code>compute_loss()</code> when not provided by the data collator. See <code>core/trainers/base.py</code>.</p>
</section>
<section id="custom-layer-types-and-peft" class="level3">
<h3 class="anchored" data-anchor-id="custom-layer-types-and-peft">Custom layer types and PEFT</h3>
<p>Vision towers often use custom module wrappers that PEFT doesn’t support:</p>
<table class="caption-top table">
<colgroup>
<col style="width: 21%">
<col style="width: 40%">
<col style="width: 21%">
<col style="width: 15%">
</colgroup>
<thead>
<tr class="header">
<th>Model</th>
<th>Custom Layer</th>
<th>Wraps</th>
<th>Fix</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Gemma4</td>
<td><code>Gemma4ClippableLinear</code></td>
<td><code>nn.Linear</code></td>
<td>Redirect to <code>.linear</code> child</td>
</tr>
</tbody>
</table>
<p>Fix location: <code>src/axolotl/loaders/adapter.py</code> <code>_patch_peft_clippable_linear()</code>.</p>
</section>
</section>
<section id="sample-packing" class="level2">
<h2 class="anchored" data-anchor-id="sample-packing">Sample Packing</h2>
<section id="how-packed-sequence-detection-works-transformers-5.x" class="level3">
<h3 class="anchored" data-anchor-id="how-packed-sequence-detection-works-transformers-5.x">How packed sequence detection works (transformers ≥ 5.x)</h3>
<p><code>transformers.masking_utils._preprocess_mask_arguments()</code> detects packed sequences from <code>position_ids</code> resets. But <strong>only when <code>attention_mask is None</code></strong>:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="co"># From masking_utils.py:</span></span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="cf">if</span> position_ids <span class="kw">is</span> <span class="kw">not</span> <span class="va">None</span> <span class="kw">and</span> attention_mask <span class="kw">is</span> <span class="va">None</span> <span class="kw">and</span> past_key_values <span class="kw">is</span> <span class="va">None</span>:</span>
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a> packed_sequence_mask <span class="op">=</span> find_packed_sequence_indices(position_ids)</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>If the collator provides an all-ones <code>attention_mask</code>, packing detection is <strong>skipped</strong> and the model builds a single causal mask spanning all packed sequences → cross-sequence attention leakage → very high loss.</p>
</section>
<section id="fix-for-models-using-create_causal_mask_mapping" class="level3">
<h3 class="anchored" data-anchor-id="fix-for-models-using-create_causal_mask_mapping">Fix for models using <code>create_causal_mask_mapping</code></h3>
<p>For Gemma3, Gemma4, and similar models that use the new transformers masking system, remove <code>attention_mask</code> from inputs when sample packing is active:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb3"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="co"># In compute_loss():</span></span>
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="cf">if</span> (</span>
<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a> <span class="va">self</span>.args.sample_packing</span>
<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a> <span class="kw">and</span> model_type <span class="kw">in</span> (<span class="st">"gemma4"</span>, <span class="st">"gemma3"</span>)</span>
<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a> <span class="kw">and</span> <span class="st">"attention_mask"</span> <span class="kw">in</span> inputs</span>
<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a> <span class="kw">and</span> <span class="st">"position_ids"</span> <span class="kw">in</span> inputs</span>
<span id="cb3-7"><a href="#cb3-7" aria-hidden="true" tabindex="-1"></a>):</span>
<span id="cb3-8"><a href="#cb3-8" aria-hidden="true" tabindex="-1"></a> <span class="kw">del</span> inputs[<span class="st">"attention_mask"</span>]</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>Fix location: <code>src/axolotl/core/trainers/base.py</code> <code>compute_loss()</code>.</p>
</section>
<section id="models-that-dont-need-this-fix" class="level3">
<h3 class="anchored" data-anchor-id="models-that-dont-need-this-fix">Models that DON’T need this fix</h3>
<p>Older models that use <code>_prepare_4d_causal_attention_mask</code> (Llama, Mistral, Qwen2, etc.) handle sample packing via axolotl’s multipack attention monkeypatch instead. Only models using the new <code>create_causal_mask_mapping</code> / <code>create_causal_mask</code> masking system need the <code>attention_mask</code> removal.</p>
</section>
</section>
<section id="attention-backend-selection" class="level2">
<h2 class="anchored" data-anchor-id="attention-backend-selection">Attention Backend Selection</h2>
<table class="caption-top table">
<colgroup>
<col style="width: 16%">
<col style="width: 14%">
<col style="width: 27%">
<col style="width: 27%">
<col style="width: 12%">
</colgroup>
<thead>
<tr class="header">
<th>Backend</th>
<th>Config</th>
<th>head_dim limit</th>
<th>torch_compile</th>
<th>Notes</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>FA2</td>
<td><code>flash_attention: true</code></td>
<td>256</td>
<td></td>
<td>Fastest when supported</td>
</tr>
<tr class="even">
<td>FA4</td>
<td>auto with <code>flash_attention: true</code></td>
<td>256 (SM90+)</td>
<td></td>
<td>Auto-detected on H100+</td>
</tr>
<tr class="odd">
<td>SDPA</td>
<td><code>sdp_attention: true</code></td>
<td>None</td>
<td></td>
<td>Universal fallback</td>
</tr>
<tr class="even">
<td>flex</td>
<td><code>flex_attention: true</code></td>
<td>None</td>
<td>⚠️ Triton OOM for large head_dim</td>
<td>Good for variable head dims</td>
</tr>
<tr class="odd">
<td>eager</td>
<td>neither set</td>
<td>None</td>
<td></td>
<td>Slowest, always works</td>
</tr>
</tbody>
</table>
<p><strong>Check model support</strong>: Look at <code>_supports_flash_attn_2</code>, <code>_supports_flex_attn</code>, <code>_supports_sdpa</code> attributes on the model class.</p>
<p><strong>head_dim gotcha</strong>: The 256 limit is specific to flash-attn CUDA kernels, NOT PyTorch-level. SDPA and flex_attention both handle arbitrary head_dim. Models with <code>global_head_dim &gt; 256</code> (Gemma4: 512) must use SDPA or flex.</p>
<p><strong>flex + compile gotcha</strong>: <code>torch_compile</code> with flex_attention can hit Triton shared memory OOM for large head_dim. Falls back to eager per-function (not a crash, but slower). Unsloth disables flex for Gemma4 for this reason.</p>
</section>
<section id="cut-cross-entropy-cce" class="level2">
<h2 class="anchored" data-anchor-id="cut-cross-entropy-cce">Cut Cross Entropy (CCE)</h2>
<section id="how-cce-patches-work" class="level3">
<h3 class="anchored" data-anchor-id="how-cce-patches-work">How CCE patches work</h3>
<p>CCE replaces the model’s <code>forward()</code> with a fused version that computes loss from hidden states + lm_head weight without materializing the full logits tensor. This saves ~<code>batch × seq_len × vocab_size × dtype_bytes</code> of VRAM.</p>
</section>
<section id="adding-cce-for-a-new-model" class="level3">
<h3 class="anchored" data-anchor-id="adding-cce-for-a-new-model">Adding CCE for a new model</h3>
<ol type="1">
<li>Check if the model type is in <code>cut_cross_entropy.transformers.patch.PATCH_FNS</code></li>
<li>If not, axolotl’s generic fallback (<code>integrations/cut_cross_entropy/__init__.py</code> <code>patch_llama_like()</code>) patches <code>{Prefix}ForCausalLM.forward</code> with <code>cce_forward</code></li>
<li>For multimodal models (<code>ForConditionalGeneration</code>), a model-specific patch is needed in <code>ml-cross-entropy</code> repo</li>
<li>The multimodal <code>cce_forward</code> must accept all extra kwargs (pixel_values, mm_token_type_ids, etc.) and pop any that would conflict before calling <code>self.model()</code></li>
</ol>
</section>
<section id="common-cce-pitfall" class="level3">
<h3 class="anchored" data-anchor-id="common-cce-pitfall">Common CCE pitfall</h3>
<p>If CCE appears active (log says “Applying Cut Cross Entropy”) but peak VRAM doesn’t decrease, check which class was patched. If the model loads as <code>ForConditionalGeneration</code> but CCE patched <code>ForCausalLM</code>, the patch is silently inactive.</p>
</section>
</section>
<section id="moe-models" class="level2">
<h2 class="anchored" data-anchor-id="moe-models">MoE Models</h2>
<section id="dense-mlp-vs-moe-experts" class="level3">
<h3 class="anchored" data-anchor-id="dense-mlp-vs-moe-experts">Dense MLP vs MoE experts</h3>
<p>Some MoE models (e.g., Gemma4) have BOTH dense MLP layers and MoE expert layers at every decoder layer:</p>
<ul>
<li><code>gate_proj/up_proj/down_proj</code> → targets the <strong>dense MLP</strong> (<code>Gemma4TextMLP</code>)</li>
<li><code>experts.gate_up_proj/experts.down_proj</code> → targets the <strong>MoE experts</strong> (<code>Gemma4TextExperts</code>)</li>
</ul>
<p>LoRA on the dense MLP works normally. Expert LoRA via <code>lora_target_parameters</code> requires PEFT support for the specific expert module type (may warn “Unsupported layer type”).</p>
</section>
<section id="scattermoe-kernels" class="level3">
<h3 class="anchored" data-anchor-id="scattermoe-kernels">ScatterMoE kernels</h3>
<p><code>use_scattermoe: true</code> with <code>experts_implementation: scattermoe</code> registers fused expert kernels via transformers <code>ExpertsInterface</code>. Significant speedup for MoE models. Requires the kernels plugin:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb4"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="fu">plugins</span><span class="kw">:</span></span>
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> axolotl.integrations.kernels.KernelsPlugin</span></span>
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="fu">use_kernels</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a><span class="fu">use_scattermoe</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a><span class="fu">experts_implementation</span><span class="kw">:</span><span class="at"> scattermoe</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
</section>
<section id="where-to-add-model-specific-fixes" class="level2">
<h2 class="anchored" data-anchor-id="where-to-add-model-specific-fixes">Where to Add Model-Specific Fixes</h2>
<table class="caption-top table">
<colgroup>
<col style="width: 27%">
<col style="width: 31%">
<col style="width: 40%">
</colgroup>
<thead>
<tr class="header">
<th>What</th>
<th>Where</th>
<th>Example</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td>Missing forward inputs</td>
<td><code>core/trainers/base.py</code> <code>compute_loss()</code></td>
<td>mm_token_type_ids injection</td>
</tr>
<tr class="even">
<td>Attention mask fixes</td>
<td><code>core/trainers/base.py</code> <code>compute_loss()</code></td>
<td>Sample packing mask removal</td>
</tr>
<tr class="odd">
<td>Loss logging fixes</td>
<td><code>core/trainers/base.py</code> <code>__init__()</code></td>
<td>model_accepts_loss_kwargs override</td>
</tr>
<tr class="even">
<td>PEFT/LoRA patches</td>
<td><code>loaders/adapter.py</code></td>
<td>ClippableLinear redirect</td>
</tr>
<tr class="odd">
<td>Attention patches</td>
<td><code>monkeypatch/attention/</code></td>
<td>FA4 tuple fix</td>
</tr>
<tr class="even">
<td>Model-specific patches</td>
<td><code>loaders/patch_manager.py</code> <code>_apply_model_specific_patches()</code></td>
<td>Llama4, Kimi, NemotronH</td>
</tr>
<tr class="odd">
<td>CCE patches</td>
<td><code>ml-cross-entropy</code> repo <code>transformers/</code></td>
<td>Per-model cce_forward</td>
</tr>
<tr class="even">
<td>Example configs</td>
<td><code>examples/&lt;model&gt;/</code></td>
<td>Validated YAML</td>
</tr>
<tr class="odd">
<td>Config validation</td>
<td><code>utils/schemas/validation.py</code></td>
<td>Compatibility checks</td>
</tr>
</tbody>
</table>
</section>
</section>
</main> <!-- /main -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
// Glyph shown next to every `.anchored` heading. AnchorJS renders whatever
// string is passed as `icon`; an empty string produces an invisible link.
// The glyph is written as an escape sequence (U+E9CB, a private-use code
// point) so it cannot be lost by tools that strip non-printable characters.
const icon = "\uE9CB";
// Create the AnchorJS instance and place its links to the right of headings.
const anchorJS = new window.AnchorJS();
anchorJS.options = {
  placement: 'right',
  icon: icon
};
// Attach an anchor link to every element carrying the `anchored` class.
anchorJS.add('.anchored');
// Returns true when `el` has at least one class whose name begins with
// "code-annotation-", false otherwise.
const isCodeAnnotation = (el) => {
  const annotationPrefix = 'code-annotation-';
  return Array.from(el.classList).some((className) => className.startsWith(annotationPrefix));
}
// ClipboardJS "success" handler: briefly marks the copy button as checked,
// shows a "Copied!" title (and a Bootstrap tooltip when Bootstrap is loaded),
// then restores the button after one second.
const onCopySuccess = function(e) {
  const checkedClass = 'code-copy-button-checked';
  const copiedLabel = "Copied!";

  // The button that triggered the copy; drop focus so it does not stay highlighted.
  const copyButton = e.trigger;
  copyButton.blur();

  // Flash the "checked" state and remember the title so it can be restored.
  copyButton.classList.add(checkedClass);
  const previousTitle = copyButton.getAttribute("title");
  copyButton.setAttribute("title", copiedLabel);

  // Bootstrap is optional; without it only the title/class change is visible.
  let activeTooltip;
  if (window.bootstrap) {
    copyButton.setAttribute("data-bs-toggle", "tooltip");
    copyButton.setAttribute("data-bs-placement", "left");
    copyButton.setAttribute("data-bs-title", copiedLabel);
    const tooltipOptions = {
      trigger: "manual",
      customClass: "code-copy-button-tooltip",
      offset: [0, -8]
    };
    activeTooltip = new bootstrap.Tooltip(copyButton, tooltipOptions);
    activeTooltip.show();
  }

  // Undo everything after one second.
  setTimeout(() => {
    if (activeTooltip) {
      activeTooltip.hide();
      copyButton.removeAttribute("data-bs-title");
      copyButton.removeAttribute("data-bs-toggle");
      copyButton.removeAttribute("data-bs-placement");
    }
    copyButton.setAttribute("title", previousTitle);
    copyButton.classList.remove(checkedClass);
  }, 1000);

  // Clear the selection ClipboardJS made in order to copy.
  e.clearSelection();
}
// Produces the text ClipboardJS should copy for a given copy button.
//
// `trigger` is the clicked button. Its parent element is deep-cloned so the
// live page is never modified; the first <code> element inside that clone is
// then stripped of direct children that are code annotations (as decided by
// isCodeAnnotation) and its innerText is returned.
//
// Returns an empty string when the clone contains no <code> element instead
// of throwing on a null reference.
const getTextToCopy = function(trigger) {
  const outerScaffold = trigger.parentElement.cloneNode(true);
  const codeEl = outerScaffold.querySelector('code');
  if (!codeEl) {
    return "";
  }
  // `children` is a live collection: removing an element while iterating it
  // directly shifts the remaining items and skips the one that follows each
  // removal. Iterate over a static snapshot so every annotation is removed.
  for (const childEl of Array.from(codeEl.children)) {
    if (isCodeAnnotation(childEl)) {
      childEl.remove();
    }
  }
  return codeEl.innerText;
}
// Copy buttons in the page body (everything not flagged as living inside the
// embedded-source modal) share one ClipboardJS instance.
const clipboard = new window.ClipboardJS(
  '.code-copy-button:not([data-in-quarto-modal])',
  { text: getTextToCopy }
);
clipboard.on('success', onCopySuccess);
{
  // Copy buttons inside the embedded-source modal get their own instance,
  // with the modal element passed as ClipboardJS's `container`. Only set up
  // when the modal element exists in the document.
  const sourceModalEl = window.document.getElementById('quarto-embedded-source-code-modal');
  if (sourceModalEl) {
    const clipboardModal = new window.ClipboardJS(
      '.code-copy-button[data-in-quarto-modal]',
      { text: getTextToCopy, container: sourceModalEl }
    );
    clipboardModal.on('success', onCopySuccess);
  }
}
// Patterns used to decide whether a link stays within this site.
// Regex literals are used so that escapes are taken literally: the previous
// string-built site pattern ("https:\/\/docs\.axolotl\.ai") lost its
// backslashes during string parsing, leaving dots that matched any character.
var localhostRegex = /^(?:http|https):\/\/localhost\:?[0-9]*\//;
var mailtoRegex = /^mailto:/;
var filterRegex = /https:\/\/docs\.axolotl\.ai/;
// A link is internal when it targets the published site, localhost, or is a mailto: link.
var isInternal = (href) => {
  return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
}
// Inspect non-navigation links and adorn them if external
var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
for (var i=0; i<links.length; i++) {
  const link = links[i];
  if (!isInternal(link.href)) {
    // undo the damage that might have been done by quarto-nav.js in the case of
    // links that we want to consider external
    if (link.dataset.originalHref !== undefined) {
      link.href = link.dataset.originalHref;
    }
  }
}
// Attaches a tippy tooltip to `el` using the shared Quarto tooltip settings.
//   contentFn      - optional; passed to tippy as `content`
//   onTriggerFn    - optional; passed to tippy as `onTrigger`
//   onUntriggerFn  - optional; passed to tippy as `onUntrigger`
// Optional callbacks are only added to the config when they are truthy.
function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
  const config = {
    allowHTML: true,
    maxWidth: 500,
    delay: 100,
    arrow: false,
    // Mount the tooltip inside the parent of the element it is given.
    appendTo: (reference) => reference.parentElement,
    interactive: true,
    interactiveBorder: 10,
    theme: 'quarto',
    placement: 'bottom-start',
  };
  const optionalHandlers = [
    ['content', contentFn],
    ['onTrigger', onTriggerFn],
    ['onUntrigger', onUntriggerFn],
  ];
  for (const [optionName, handler] of optionalHandlers) {
    if (handler) {
      config[optionName] = handler;
    }
  }
  window.tippy(el, config);
}
// Footnote references: each one gets a hover tooltip whose content is the
// innerHTML of the element the reference points at.
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (var i = 0; i < noterefs.length; i++) {
  const ref = noterefs[i];
  tippyHover(ref, () => {
    // Prefer the data attribute, falling back to the plain href.
    let target = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
    // Reduce an absolute URL to its hash; anything URL cannot parse is kept as-is.
    try {
      target = new URL(target).hash;
    } catch {}
    const footnote = window.document.getElementById(target.replace(/^#\/?/, ""));
    return footnote ? footnote.innerHTML : "";
  });
}
// Cross-reference links, collected here for the tooltip wiring further down.
const xrefs = window.document.querySelectorAll('a.quarto-xref');
// Builds the HTML shown in a cross-reference tooltip.
//   id   - id of the referenced element, or null when the whole page was fetched
//   note - the referenced element (callers pass a clone or a parsed-document node)
// Returns an HTML string. `note` is modified in place: layout classes are
// stripped and, for non-section targets, the AnchorJS link is removed.
const processXRef = (id, note) => {
  // Recursively remove the page-column layout classes from an element tree.
  const stripColumnClz = (el) => {
    el.classList.remove("page-full", "page-columns");
    if (el.children) {
      for (const child of el.children) {
        stripColumnClz(child);
      }
    }
  };
  // Ask Quarto to typeset math inside `el` when that hook is available.
  const typesetIfAvailable = (el) => {
    if (window.Quarto?.typesetMath) {
      window.Quarto.typesetMath(el);
    }
  };

  stripColumnClz(note);

  const isSectionTarget = id === null || id.startsWith('sec-');
  if (!isSectionTarget) {
    // Remove any anchor links if they are present
    const anchorLink = note.querySelector('a.anchorjs-link');
    if (anchorLink) {
      anchorLink.remove();
    }
    typesetIfAvailable(note);
    // Callouts are returned with their wrapper element; everything else without.
    return note.classList.contains("callout") ? note.outerHTML : note.innerHTML;
  }

  // Sections with at most two children are shown in full.
  const hasManyChildren = note.children && note.children.length > 2;
  if (!hasManyChildren) {
    typesetIfAvailable(note);
    return note.innerHTML;
  }

  // Longer sections: show only the first child plus the first following
  // child that is not an empty paragraph.
  const container = document.createElement("div");
  container.appendChild(note.children[0].cloneNode(true));
  for (let i = 1; i < note.children.length; i++) {
    const child = note.children[i];
    const isEmptyParagraph = child.tagName === "P" && child.innerText === "";
    if (isEmptyParagraph) {
      continue;
    }
    container.appendChild(child.cloneNode(true));
    break;
  }
  typesetIfAvailable(container);
  return container.innerHTML;
}
// Cross-reference links: attach a tooltip to each one with no static content.
// The tooltip body is resolved in the trigger callback (third argument of
// tippyHover) and installed through instance.setContent().
for (var i=0; i<xrefs.length; i++) {
const xref = xrefs[i];
tippyHover(xref, undefined, function(instance) {
// Disabled here; every branch below re-enables and shows the instance once
// its content lookup has finished.
instance.disable();
let url = xref.getAttribute('href');
let hash = undefined;
if (url.startsWith('#')) {
hash = url;
} else {
// If URL() cannot parse the href, hash stays undefined and the whole-page
// branch below is taken.
try { hash = new URL(url).hash; } catch {}
}
if (hash) {
// Strip the leading "#" (and an optional "/") to obtain the element id.
const id = hash.replace(/^#\/?/, "");
const note = window.document.getElementById(id);
if (note !== null) {
// Target is in the current document: process a clone so the page itself
// is left untouched.
try {
const html = processXRef(id, note.cloneNode(true));
instance.setContent(html);
} finally {
// Runs even if processXRef throws.
instance.enable();
instance.show();
}
} else {
// See if we can fetch this
// (the URL without its fragment is fetched and searched for the same id)
fetch(url.split('#')[0])
.then(res => res.text())
.then(html => {
const parser = new DOMParser();
const htmlDoc = parser.parseFromString(html, "text/html");
const note = htmlDoc.getElementById(id);
if (note !== null) {
const html = processXRef(id, note);
instance.setContent(html);
}
}).finally(() => {
// Runs whether the fetch chain resolved or rejected; no .catch() is attached.
instance.enable();
instance.show();
});
}
} else {
// See if we can fetch a full url (with no hash to target)
// This is a special case and we should probably do some content thinning / targeting
fetch(url)
.then(res => res.text())
.then(html => {
const parser = new DOMParser();
const htmlDoc = parser.parseFromString(html, "text/html");
const note = htmlDoc.querySelector('main.content');
if (note !== null) {
// This should only happen for chapter cross references
// (since there is no id in the URL)
// remove the first header
if (note.children.length > 0 && note.children[0].tagName === "HEADER") {
note.children[0].remove();
}
// A null id makes processXRef treat the content like a section.
const html = processXRef(null, note);
instance.setContent(html);
}
}).finally(() => {
// Runs whether the fetch chain resolved or rejected; no .catch() is attached.
instance.enable();
instance.show();
});
}
}, function(instance) {
// Untrigger callback: intentionally empty.
});
}
// Declared without an initial value; nothing in this block assigns it.
let selectedAnnoteEl;
// Builds the CSS selector for the <span> carrying the given code cell id and
// annotation id in its data-code-cell / data-code-annotation attributes.
const selectorForAnnotation = (cell, annotation) =>
  `span[data-code-cell="${cell}"][data-code-annotation="${annotation}"]`;
// Highlight the code lines that an annotation element points at.
//
// annoteEl: element carrying `data-target-cell` and `data-target-annotation`.
//
// The matching annotation <span> lists its lines in `data-code-lines`
// (comma separated); each line element has the id "<targetCell>-<line>".
// Two absolutely positioned <div>s are created on demand and stretched over
// the first..last line: one inside the grandparent of the first line element
// and one inside the cell's `.code-annotation-gutter`.
// If the span, its line list or the first line element is missing, the
// function returns without touching the DOM or `selectedAnnoteEl`.
const selectCodeLines = (annoteEl) => {
  const targetCell = annoteEl.getAttribute("data-target-cell");
  const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
  const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
  const codeLines = annoteSpan ? annoteSpan.getAttribute("data-code-lines") : null;
  if (!codeLines) {
    // No annotation span for this target, or it lists no lines.
    return;
  }
  const lineIds = codeLines.split(",").map((line) => targetCell + "-" + line);

  // compute the position of the first line (top and height)
  const firstEl = window.document.getElementById(lineIds[0]);
  if (!firstEl) {
    return;
  }
  const top = firstEl.offsetTop;
  let height = firstEl.offsetHeight;
  const parent = firstEl.parentElement?.parentElement ?? null;
  if (lineIds.length > 1) {
    // Extend the highlight down to the bottom edge of the last line.
    const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
    if (lastEl) {
      height = lastEl.offsetTop + lastEl.offsetHeight - top;
    }
  }

  if (parent !== null) {
    // cook up a div (if necessary) and position it
    let div = window.document.getElementById("code-annotation-line-highlight");
    if (div === null) {
      div = window.document.createElement("div");
      div.setAttribute("id", "code-annotation-line-highlight");
      div.style.position = 'absolute';
      parent.appendChild(div);
    }
    // The highlight is padded by 2px above and below the lines.
    div.style.top = top - 2 + "px";
    div.style.height = height + 4 + "px";
    div.style.left = 0;

    let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
    if (gutterDiv === null) {
      const gutter = window.document.getElementById(targetCell)?.querySelector('.code-annotation-gutter');
      if (gutter) {
        gutterDiv = window.document.createElement("div");
        gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
        gutterDiv.style.position = 'absolute';
        gutter.appendChild(gutterDiv);
      }
    }
    if (gutterDiv) {
      gutterDiv.style.top = top - 2 + "px";
      gutterDiv.style.height = height + 4 + "px";
    }
  }
  // Remember the selection so the resize handler can re-run the positioning.
  selectedAnnoteEl = annoteEl;
};
// Remove both highlight overlays (line and gutter) from the document, if
// present, and forget the currently selected annotation element.
const unselectCodeLines = () => {
  for (const overlayId of ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"]) {
    window.document.getElementById(overlayId)?.remove();
  }
  selectedAnnoteEl = undefined;
};
// On window resize (rate limited through throttle with a 10 ms delay), reset
// elRect and re-run the highlight positioning for the selected annotation.
window.addEventListener("resize", throttle(function () {
  elRect = undefined;
  if (!selectedAnnoteEl) {
    return;
  }
  selectCodeLines(selectedAnnoteEl);
}, 10));
// Wrap `fn` so that bursts of calls are rate limited.
//   fn: the function to invoke
//   ms: delay in milliseconds applied to calls that arrive while gated
// The first call runs `fn` immediately and closes the gate. Every later call
// cancels any pending timer and schedules `fn` with that call's arguments
// after `ms`; when the timer fires, `fn` runs and the gate opens again.
function throttle(fn, ms) {
  let gated = false;
  let pending;
  return (...args) => {
    if (gated) {
      // Only the most recent gated call survives.
      if (pending) clearTimeout(pending);
      pending = setTimeout(() => {
        fn.apply(this, args);
        gated = false;
        pending = false;
      }, ms);
      return;
    }
    // Gate is open: run right away, then close it.
    fn.apply(this, args);
    gated = true;
  };
}
// Attach a click handler to every annotation <dt> that targets a code cell.
// Clicking a <dt> highlights its code lines and marks it with the
// `code-annotation-active` class; clicking the selected <dt> again clears
// the highlight and the class.
const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
for (const annoteDlNode of annoteDls) {
  annoteDlNode.addEventListener('click', (event) => {
    // Use the element the listener is bound to. event.target would be a
    // descendant node when the click lands on markup nested inside the <dt>,
    // and that node carries none of the data-target-* attributes.
    const clickedEl = event.currentTarget;
    if (clickedEl !== selectedAnnoteEl) {
      unselectCodeLines();
      const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
      if (activeEl) {
        activeEl.classList.remove('code-annotation-active');
      }
      selectCodeLines(clickedEl);
      clickedEl.classList.add('code-annotation-active');
    } else {
      // Unselect the line
      unselectCodeLines();
      clickedEl.classList.remove('code-annotation-active');
    }
  });
}
// Walk up from `el` until a parent with a non-empty `data-cites` attribute is
// found. Returns { el, cites } where `el` is the child of that parent on the
// walked path and `cites` is the attribute value split on single spaces.
// Returns undefined when no ancestor carries the attribute.
const findCites = (el) => {
  let node = el;
  let parent = node.parentElement;
  while (parent) {
    const cites = parent.dataset.cites;
    if (cites) {
      return { el: node, cites: cites.split(' ') };
    }
    node = parent;
    parent = node.parentElement;
  }
  return undefined;
};
// Give every bibliography reference link a hover popup whose content is
// copied from the matching `ref-<cite>` entries in this document.
var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
bibliorefs.forEach((ref) => {
  const citeInfo = findCites(ref);
  if (!citeInfo) {
    return;
  }
  tippyHover(citeInfo.el, function() {
    // Assemble one entry per cited key inside a detached container and hand
    // back its markup. A key with no matching element yields an empty entry.
    const popup = window.document.createElement('div');
    for (const cite of citeInfo.cites) {
      const entry = window.document.createElement('div');
      entry.classList.add('hanging-indent', 'csl-entry');
      const source = window.document.getElementById('ref-' + cite);
      if (source) {
        entry.innerHTML = source.innerHTML;
      }
      popup.appendChild(entry);
    }
    return popup.innerHTML;
  });
});
});
</script>
</div> <!-- /content -->
</body></html>