Files
axolotl/docs/rlhf.html
Quarto GHA Workflow Runner 1d736a2a49 Built site for gh-pages
2026-03-25 15:23:28 +00:00

2680 lines
243 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
<meta charset="utf-8">
<!-- Document metadata: generator stamp, responsive viewport (zooming stays enabled), search description and title. -->
<meta name="generator" content="quarto-1.9.36">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
<meta name="description" content="Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human feedback.">
<!-- Page name and site name are kept apart by a separator so tabs, bookmarks and search results do not read as one run-on phrase. -->
<title>RLHF (Beta) – Axolotl</title>
<style>
/*
 * Pandoc's default stylesheet.
 * Configuration variables are described at
 * https://pandoc.org/MANUAL.html#variables-for-html
 */

/* ---- Generic inline and block helpers ---- */
code {
  white-space: pre-wrap;
}
span.smallcaps {
  font-variant: small-caps;
}
div.columns {
  display: flex;
  gap: min(4vw, 1.5em);
}
div.column {
  flex: auto;
  overflow-x: auto;
}
div.hanging-indent {
  margin-left: 1.5em;
  text-indent: -1.5em;
}

/* ---- Task lists (checkbox bullets) ---- */
ul.task-list {
  list-style: none;
}
ul.task-list li input[type="checkbox"] {
  width: 0.8em;
  /* Quarto-specific margin, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
  margin: 0 0.8em 0.2em -1em;
  vertical-align: middle;
}

/* ---- Syntax-highlighted code blocks ---- */
html {
  -webkit-text-size-adjust: 100%;
}
pre > code.sourceCode {
  white-space: pre;
  position: relative;
}
pre > code.sourceCode > span {
  display: inline-block;
  line-height: 1.25;
}
pre > code.sourceCode > span:empty {
  height: 1.2em;
}
.sourceCode {
  overflow: visible;
}
code.sourceCode > span {
  color: inherit;
  text-decoration: inherit;
}
div.sourceCode {
  margin: 1em 0;
}
pre.sourceCode {
  margin: 0;
}

/* On screen, code blocks scroll instead of overflowing their container. */
@media screen {
  div.sourceCode {
    overflow: auto;
  }
}

/* In print, long lines wrap with a hanging indent. */
@media print {
  pre > code.sourceCode {
    white-space: pre-wrap;
  }
  pre > code.sourceCode > span {
    text-indent: -5em;
    padding-left: 5em;
  }
}

/* ---- Numbered code listings: a CSS counter rendered before each line's first anchor ---- */
pre.numberSource code {
  counter-reset: source-line 0;
}
pre.numberSource code > span {
  position: relative;
  left: -4em;
  counter-increment: source-line;
}
pre.numberSource code > span > a:first-child::before {
  content: counter(source-line);
  position: relative;
  left: -1em;
  text-align: right;
  vertical-align: baseline;
  border: none;
  display: inline-block;
  /* Line numbers are excluded from text selection. */
  -webkit-touch-callout: none;
  -webkit-user-select: none;
  -khtml-user-select: none;
  -moz-user-select: none;
  -ms-user-select: none;
  user-select: none;
  padding: 0 4px;
  width: 4em;
}
pre.numberSource {
  margin-left: 3em;
  padding-left: 4px;
}

/* Empty rule retained from the generated stylesheet. */
div.sourceCode {
}

@media screen {
  pre > code.sourceCode > span > a:first-child::before {
    text-decoration: underline;
  }
}
</style>
<!--
  Site-wide assets, all resolved relative to ../site_libs.
  The classic scripts below carry neither async nor defer, so they are fetched and
  executed in document order while the head is parsed; keep their relative order.
-->
<!-- Navigation, clipboard and search libraries. -->
<script src="../site_libs/quarto-nav/quarto-nav.js"></script>
<script src="../site_libs/clipboard/clipboard.min.js"></script>
<script src="../site_libs/quarto-search/autocomplete.umd.js"></script>
<script src="../site_libs/quarto-search/fuse.min.js"></script>
<script src="../site_libs/quarto-search/quarto-search.js"></script>
<!-- NOTE(review): presumably the relative path from this page to the site root, matching the "../" prefix used by every local URL here; confirm against the Quarto scripts. -->
<meta name="quarto:offset" content="../">
<link href="../favicon.jpg" rel="icon" type="image/jpeg">
<!-- Module scripts: type="module" scripts are deferred by default. -->
<script src="../site_libs/quarto-html/quarto.js" type="module"></script>
<script src="../site_libs/quarto-html/tabsets/tabsets.js" type="module"></script>
<!-- Tooltip (popper + tippy) and heading-anchor libraries with their stylesheet. -->
<script src="../site_libs/quarto-html/popper.min.js"></script>
<script src="../site_libs/quarto-html/tippy.umd.min.js"></script>
<script src="../site_libs/quarto-html/anchor.min.js"></script>
<link href="../site_libs/quarto-html/tippy.css" rel="stylesheet">
<!-- Syntax-highlighting theme (dark variant, content-hashed file name). -->
<link href="../site_libs/quarto-html/quarto-syntax-highlighting-dark-f418161beb48e0141c760e455f12af2c.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<!-- Bootstrap script, icon font and content-hashed theme stylesheet (data-mode="dark"). -->
<script src="../site_libs/bootstrap/bootstrap.min.js"></script>
<link href="../site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="../site_libs/bootstrap/bootstrap-f15b14cef494beb09422a8174b542cad.min.css" rel="stylesheet" append-hash="true" id="quarto-bootstrap" data-mode="dark">
<!--
  Search settings stored as a JSON data block: type="application/json" is never executed,
  so the content must stay strict JSON (no comments inside).
  NOTE(review): presumably read by the quarto-search scripts; confirm before renaming the id or any key.
-->
<script id="quarto-search-options" type="application/json">
{
  "location": "navbar",
  "copy-button": false,
  "collapse-after": 3,
  "panel-placement": "end",
  "type": "overlay",
  "limit": 50,
  "keyboard-shortcut": ["f", "/", "s"],
  "show-item-context": false,
  "language": {
    "search-no-results-text": "No results",
    "search-matching-documents-text": "matching documents",
    "search-copy-link-title": "Copy link to search",
    "search-hide-matches-text": "Hide additional matches",
    "search-more-match-text": "more match in this document",
    "search-more-matches-text": "more matches in this document",
    "search-clear-button-title": "Clear",
    "search-text-placeholder": "",
    "search-detached-cancel-button-title": "Cancel",
    "search-submit-button-title": "Submit",
    "search-label": "Search"
  }
}
</script>
<!-- Google tag (gtag.js) for property G-9KYCVJBNMQ; the library is fetched asynchronously. -->
<script async src="https://www.googletagmanager.com/gtag/js?id=G-9KYCVJBNMQ"></script>
<script>
  // Reuse an existing queue if one is already defined, otherwise start an empty one.
  window.dataLayer = window.dataLayer || [];

  // Global helper: pushes the raw `arguments` object (not a copied array) onto the queue.
  function gtag() {
    dataLayer.push(arguments);
  }

  gtag('js', new Date());
  gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true });
</script>
<link rel="stylesheet" href="../styles.css">
</head>
<body class="nav-sidebar docked nav-fixed quarto-light">
<div id="quarto-search-results"></div>
<!--
  Fixed page header: the primary navbar (brand logo, social links, search mount point)
  followed by the secondary bar (sidebar toggle and breadcrumbs).
-->
<header id="quarto-header" class="headroom fixed-top">
  <!-- Labelled so it can be told apart from the other nav landmarks on the page. -->
  <nav class="navbar navbar-expand " data-bs-theme="dark" aria-label="Main">
    <div class="navbar-container container-fluid">
      <div class="navbar-brand-container mx-auto">
        <!-- Both logo images are decorative (alt=""), so the link carries the accessible name itself. -->
        <a href="../index.html" class="navbar-brand navbar-brand-logo" aria-label="Axolotl home">
          <img src="../image/axolotl_logo_digital_white.svg" alt="" class="navbar-logo light-content">
          <img src="../image/axolotl_logo_digital_white.svg" alt="" class="navbar-logo dark-content">
        </a>
      </div>
      <!-- Icon-only links: the name comes from aria-label, and the icon glyph is hidden from assistive technology. -->
      <div class="quarto-navbar-tools tools-wide tools-end">
        <a href="https://twitter.com/axolotl_ai" class="quarto-navigation-tool px-1" aria-label="Axolotl on Twitter"><i class="bi bi-twitter" aria-hidden="true"></i></a>
        <a href="https://github.com/axolotl-ai-cloud/axolotl/" class="quarto-navigation-tool px-1" aria-label="Axolotl on GitHub"><i class="bi bi-github" aria-hidden="true"></i></a>
        <a href="https://discord.gg/7m9sfhzaf3" class="quarto-navigation-tool px-1" aria-label="Axolotl on Discord"><i class="bi bi-discord" aria-hidden="true"></i></a>
      </div>
      <!-- Empty mount point carrying the search id. -->
      <div id="quarto-search" class="" title="Search"></div>
    </div> <!-- /container-fluid -->
  </nav>
  <nav class="quarto-secondary-nav">
    <div class="container-fluid d-flex">
      <!-- Native button: no explicit role needed. The inline handler is kept as generated and guards on the hook's existence before calling it. -->
      <button type="button" class="quarto-btn-toggle btn" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
        <i class="bi bi-layout-text-sidebar-reverse" aria-hidden="true"></i>
      </button>
      <!-- The last breadcrumb points at this page, so it is flagged as the current page. -->
      <nav class="quarto-page-breadcrumbs" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="../docs/multimodal.html">How To Guides</a></li><li class="breadcrumb-item"><a href="../docs/rlhf.html" aria-current="page">RLHF (Beta)</a></li></ol></nav>
      <!-- NOTE(review): href-less anchor used as a click-only filler that repeats the toggle above; role="navigation" is a landmark role and looks wrong here, but it is left as generated until it is confirmed that no script or stylesheet selects on it. -->
      <a class="flex-grow-1" role="navigation" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
      </a>
    </div>
  </nav>
</header>
<!-- content -->
<div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article page-navbar">
<!-- sidebar -->
<nav id="quarto-sidebar" class="sidebar collapse collapse-horizontal quarto-sidebar-collapse-item sidebar-navigation docked overflow-auto">
<div class="sidebar-menu-container">
<ul class="list-unstyled mt-1">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../index.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Home</span></a>
</div>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true">
<span class="menu-text">Getting Started</span></a>
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-1" class="collapse list-unstyled sidebar-section depth1 show">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/getting-started.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Quickstart</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/installation.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Installation</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/inference.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Inference and Merging</span></a>
</div>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" role="navigation" aria-expanded="false">
<span class="menu-text">Model Guides</span></a>
<a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" role="navigation" aria-expanded="false" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-2" class="collapse list-unstyled sidebar-section depth2 ">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/kimi-linear.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Kimi Linear</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/plano.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Plano Orchestrator</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/mimo.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">MiMo</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/internvl3_5.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">InternVL 3.5</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/olmo3.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">OLMo 3</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/trinity.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Trinity</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/arcee.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Arcee AFM</span></a>
</div>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="false">
<span class="menu-text">Ministral3</span></a>
<a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="false" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-3" class="collapse list-unstyled sidebar-section depth3 ">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/ministral3.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Ministral3</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/ministral3/think.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Ministral 3 Thinking</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/ministral3/vision.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Ministral 3 Vision</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-4" role="navigation" aria-expanded="false">
<span class="menu-text">Magistral</span></a>
<a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-4" role="navigation" aria-expanded="false" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-4" class="collapse list-unstyled sidebar-section depth3 ">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/magistral.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Magistral</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/magistral/think.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Magistral Thinking</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/magistral/vision.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Magistral Vision</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/ministral.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Ministral</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/mistral-small.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Mistral Small 3.1/3.2</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/voxtral.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Voxtral</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/devstral.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Devstral</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/mistral.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Mistral 7B</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/llama-4.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Llama 4</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/llama-2.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Llama 2</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/qwen3-next.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Qwen 3 Next</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/qwen3.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Qwen 3</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/gemma3n.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Gemma 3n</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/apertus.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Apertus</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/gpt-oss.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">GPT-OSS</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/seed-oss.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Seed-OSS</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/phi.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Phi</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/smolvlm2.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">SmolVLM 2</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/granite4.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Granite 4</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/LiquidAI.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Liquid Foundation Models 2</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/hunyuan.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Hunyuan</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/jamba.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Jamba</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/models/orpheus.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Orpheus</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/cli.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Command Line Interface (CLI)</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/telemetry.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Telemetry</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/config-reference.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Config Reference</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/api" class="sidebar-item-text sidebar-link">
<span class="menu-text">API Reference</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a href="../docs/dataset-formats/index.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Dataset Formats</span></a>
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-5" role="navigation" aria-expanded="true" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-5" class="collapse list-unstyled sidebar-section depth1 show">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/dataset-formats/pretraining.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Pre-training</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/dataset-formats/inst_tune.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Instruction Tuning</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/dataset-formats/conversation.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Conversation</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/dataset-formats/stepwise_supervised.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Stepwise Supervised Format</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/dataset-formats/template_free.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Template-Free</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/dataset-formats/tokenized.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Custom Pre-Tokenized Dataset</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-6" role="navigation" aria-expanded="true">
<span class="menu-text">Deployments</span></a>
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-6" role="navigation" aria-expanded="true" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-6" class="collapse list-unstyled sidebar-section depth1 show">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/docker.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Docker</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/multi-gpu.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Multi-GPU</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/multi-node.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Multi Node</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Ray Train</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">AMD GPUs on HPC Systems</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/mac.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Mac M-series</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-7" role="navigation" aria-expanded="true">
<span class="menu-text">How To Guides</span></a>
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-7" role="navigation" aria-expanded="true" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-7" class="collapse list-unstyled sidebar-section depth1 show">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/multimodal.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">MultiModal / Vision Language Models (BETA)</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<!-- Entry for the page being viewed: aria-current exposes to assistive technology what the "active" class shows visually. -->
<a href="../docs/rlhf.html" class="sidebar-item-text sidebar-link active" aria-current="page">
<span class="menu-text">RLHF (Beta)</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/reward_modelling.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Reward Modelling</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/lr_groups.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Learning Rate Groups</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/lora_optims.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">LoRA Optimizations</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/dataset_loading.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Dataset Loading</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/qat.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Quantization Aware Training (QAT)</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/quantize.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Quantization with torchao</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/optimizations.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Optimizations Guide</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-8" role="navigation" aria-expanded="true">
<span class="menu-text">Core Concepts</span></a>
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-8" role="navigation" aria-expanded="true" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-8" class="collapse list-unstyled sidebar-section depth1 show">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/batch_vs_grad.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Batch size vs Gradient accumulation</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/dataset_preprocessing.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Dataset Preprocessing</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/streaming.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Streaming Datasets</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/multipack.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Multipack (Sample Packing)</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/mixed_precision.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Mixed Precision Training</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/optimizers.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Optimizers</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/attention.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Attention</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-9" role="navigation" aria-expanded="true">
<span class="menu-text">Advanced Features</span></a>
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-9" role="navigation" aria-expanded="true" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-9" class="collapse list-unstyled sidebar-section depth1 show">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">FSDP + QLoRA</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Unsloth</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/torchao.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">PyTorch ao</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/custom_integrations.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Custom Integrations</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/sequence_parallelism.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Sequence Parallelism</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/gradient_checkpointing.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Gradient Checkpointing, Activation Offloading, and Layer Offloading</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/nd_parallelism.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">N-D Parallelism (Beta)</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/expert_quantization.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">MoE Expert Quantization</span></a>
</div>
</li>
</ul>
</li>
<li class="sidebar-item sidebar-item-section">
<div class="sidebar-item-container">
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-10" role="navigation" aria-expanded="true">
<span class="menu-text">Troubleshooting</span></a>
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-10" role="navigation" aria-expanded="true" aria-label="Toggle section">
<i class="bi bi-chevron-right ms-2"></i>
</a>
</div>
<ul id="quarto-sidebar-section-10" class="collapse list-unstyled sidebar-section depth1 show">
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/faq.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">FAQ</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/debugging.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">Debugging</span></a>
</div>
</li>
<li class="sidebar-item">
<div class="sidebar-item-container">
<a href="../docs/nccl.html" class="sidebar-item-text sidebar-link">
<span class="menu-text">NCCL</span></a>
</div>
</li>
</ul>
</li>
</ul>
</div>
</nav>
<div id="quarto-sidebar-glass" class="quarto-sidebar-collapse-item" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item"></div>
<!-- margin-sidebar -->
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
<nav id="TOC" role="doc-toc" class="toc-active" data-toc-expanded="2">
<h2 id="toc-title">On this page</h2>
<ul>
<li><a href="#overview" id="toc-overview" class="nav-link active" data-scroll-target="#overview">Overview</a></li>
<li><a href="#rlhf-using-axolotl" id="toc-rlhf-using-axolotl" class="nav-link" data-scroll-target="#rlhf-using-axolotl">RLHF using Axolotl</a>
<ul>
<li><a href="#dpo" id="toc-dpo" class="nav-link" data-scroll-target="#dpo">DPO</a>
<ul class="collapse">
<li><a href="#chatml.argilla" id="toc-chatml.argilla" class="nav-link" data-scroll-target="#chatml.argilla">chatml.argilla</a></li>
<li><a href="#chatml.argilla_chat" id="toc-chatml.argilla_chat" class="nav-link" data-scroll-target="#chatml.argilla_chat">chatml.argilla_chat</a></li>
<li><a href="#chatml.icr" id="toc-chatml.icr" class="nav-link" data-scroll-target="#chatml.icr">chatml.icr</a></li>
<li><a href="#chatml.intel" id="toc-chatml.intel" class="nav-link" data-scroll-target="#chatml.intel">chatml.intel</a></li>
<li><a href="#chatml.prompt_pairs" id="toc-chatml.prompt_pairs" class="nav-link" data-scroll-target="#chatml.prompt_pairs">chatml.prompt_pairs</a></li>
<li><a href="#chatml.ultra" id="toc-chatml.ultra" class="nav-link" data-scroll-target="#chatml.ultra">chatml.ultra</a></li>
<li><a href="#llama3.argilla" id="toc-llama3.argilla" class="nav-link" data-scroll-target="#llama3.argilla">llama3.argilla</a></li>
<li><a href="#llama3.argilla_chat" id="toc-llama3.argilla_chat" class="nav-link" data-scroll-target="#llama3.argilla_chat">llama3.argilla_chat</a></li>
<li><a href="#llama3.icr" id="toc-llama3.icr" class="nav-link" data-scroll-target="#llama3.icr">llama3.icr</a></li>
<li><a href="#llama3.intel" id="toc-llama3.intel" class="nav-link" data-scroll-target="#llama3.intel">llama3.intel</a></li>
<li><a href="#llama3.prompt_pairs" id="toc-llama3.prompt_pairs" class="nav-link" data-scroll-target="#llama3.prompt_pairs">llama3.prompt_pairs</a></li>
<li><a href="#llama3.ultra" id="toc-llama3.ultra" class="nav-link" data-scroll-target="#llama3.ultra">llama3.ultra</a></li>
<li><a href="#zephyr.nectar" id="toc-zephyr.nectar" class="nav-link" data-scroll-target="#zephyr.nectar">zephyr.nectar</a></li>
<li><a href="#chat_template.argilla_chat" id="toc-chat_template.argilla_chat" class="nav-link" data-scroll-target="#chat_template.argilla_chat">chat_template.argilla_chat</a></li>
<li><a href="#chat_template.default" id="toc-chat_template.default" class="nav-link" data-scroll-target="#chat_template.default">chat_template.default</a></li>
<li><a href="#user_defined.default" id="toc-user_defined.default" class="nav-link" data-scroll-target="#user_defined.default">user_defined.default</a></li>
</ul></li>
<li><a href="#ipo" id="toc-ipo" class="nav-link" data-scroll-target="#ipo">IPO</a></li>
<li><a href="#orpo" id="toc-orpo" class="nav-link" data-scroll-target="#orpo">ORPO</a>
<ul class="collapse">
<li><a href="#chat_template.argilla" id="toc-chat_template.argilla" class="nav-link" data-scroll-target="#chat_template.argilla">chat_template.argilla</a></li>
</ul></li>
<li><a href="#kto" id="toc-kto" class="nav-link" data-scroll-target="#kto">KTO</a>
<ul class="collapse">
<li><a href="#chatml.argilla-1" id="toc-chatml.argilla-1" class="nav-link" data-scroll-target="#chatml.argilla-1">chatml.argilla</a></li>
<li><a href="#chatml.argilla_chat-1" id="toc-chatml.argilla_chat-1" class="nav-link" data-scroll-target="#chatml.argilla_chat-1">chatml.argilla_chat</a></li>
<li><a href="#chatml.intel-1" id="toc-chatml.intel-1" class="nav-link" data-scroll-target="#chatml.intel-1">chatml.intel</a></li>
<li><a href="#chatml.prompt_pairs-1" id="toc-chatml.prompt_pairs-1" class="nav-link" data-scroll-target="#chatml.prompt_pairs-1">chatml.prompt_pairs</a></li>
<li><a href="#chatml.ultra-1" id="toc-chatml.ultra-1" class="nav-link" data-scroll-target="#chatml.ultra-1">chatml.ultra</a></li>
<li><a href="#llama3.argilla-1" id="toc-llama3.argilla-1" class="nav-link" data-scroll-target="#llama3.argilla-1">llama3.argilla</a></li>
<li><a href="#llama3.argilla_chat-1" id="toc-llama3.argilla_chat-1" class="nav-link" data-scroll-target="#llama3.argilla_chat-1">llama3.argilla_chat</a></li>
<li><a href="#llama3.intel-1" id="toc-llama3.intel-1" class="nav-link" data-scroll-target="#llama3.intel-1">llama3.intel</a></li>
<li><a href="#llama3.prompt_pairs-1" id="toc-llama3.prompt_pairs-1" class="nav-link" data-scroll-target="#llama3.prompt_pairs-1">llama3.prompt_pairs</a></li>
<li><a href="#llama3.ultra-1" id="toc-llama3.ultra-1" class="nav-link" data-scroll-target="#llama3.ultra-1">llama3.ultra</a></li>
<li><a href="#user_defined.default-1" id="toc-user_defined.default-1" class="nav-link" data-scroll-target="#user_defined.default-1">user_defined.default</a></li>
</ul></li>
<li><a href="#grpo" id="toc-grpo" class="nav-link" data-scroll-target="#grpo">GRPO</a>
<ul class="collapse">
<li><a href="#reward-functions" id="toc-reward-functions" class="nav-link" data-scroll-target="#reward-functions">Reward functions</a></li>
<li><a href="#openenv-rollout-functions" id="toc-openenv-rollout-functions" class="nav-link" data-scroll-target="#openenv-rollout-functions">OpenEnv Rollout Functions</a></li>
<li><a href="#grpo-with-dapodr.-grpo-loss" id="toc-grpo-with-dapodr.-grpo-loss" class="nav-link" data-scroll-target="#grpo-with-dapodr.-grpo-loss">GRPO with DAPO/Dr.&nbsp;GRPO loss</a></li>
<li><a href="#async-grpo" id="toc-async-grpo" class="nav-link" data-scroll-target="#async-grpo">Async GRPO</a></li>
</ul></li>
<li><a href="#gdpo" id="toc-gdpo" class="nav-link" data-scroll-target="#gdpo">GDPO</a>
<ul class="collapse">
<li><a href="#gdpo-vs-grpo" id="toc-gdpo-vs-grpo" class="nav-link" data-scroll-target="#gdpo-vs-grpo">GDPO vs GRPO</a></li>
<li><a href="#why-gdpo" id="toc-why-gdpo" class="nav-link" data-scroll-target="#why-gdpo">Why GDPO?</a></li>
<li><a href="#reward-functions-1" id="toc-reward-functions-1" class="nav-link" data-scroll-target="#reward-functions-1">Reward Functions</a></li>
<li><a href="#sequence-parallelism" id="toc-sequence-parallelism" class="nav-link" data-scroll-target="#sequence-parallelism">Sequence Parallelism</a></li>
</ul></li>
<li><a href="#simpo" id="toc-simpo" class="nav-link" data-scroll-target="#simpo">SimPO</a></li>
<li><a href="#ebft" id="toc-ebft" class="nav-link" data-scroll-target="#ebft">EBFT</a>
<ul class="collapse">
<li><a href="#structured-mode" id="toc-structured-mode" class="nav-link" data-scroll-target="#structured-mode">Structured Mode</a></li>
<li><a href="#strided-mode" id="toc-strided-mode" class="nav-link" data-scroll-target="#strided-mode">Strided Mode</a></li>
<li><a href="#ebft-configuration-reference" id="toc-ebft-configuration-reference" class="nav-link" data-scroll-target="#ebft-configuration-reference">EBFT Configuration Reference</a></li>
</ul></li>
<li><a href="#nemo-gym-integration" id="toc-nemo-gym-integration" class="nav-link" data-scroll-target="#nemo-gym-integration">NeMo Gym Integration</a>
<ul class="collapse">
<li><a href="#single-turn-simplest" id="toc-single-turn-simplest" class="nav-link" data-scroll-target="#single-turn-simplest">Single-Turn (Simplest)</a></li>
<li><a href="#multi-turn-with-async-grpo-recommended" id="toc-multi-turn-with-async-grpo-recommended" class="nav-link" data-scroll-target="#multi-turn-with-async-grpo-recommended">Multi-Turn with Async GRPO (Recommended)</a></li>
<li><a href="#nemo-gym-prerequisites" id="toc-nemo-gym-prerequisites" class="nav-link" data-scroll-target="#nemo-gym-prerequisites">NeMo Gym Prerequisites</a></li>
<li><a href="#nemo-gym-configuration-reference" id="toc-nemo-gym-configuration-reference" class="nav-link" data-scroll-target="#nemo-gym-configuration-reference">NeMo Gym Configuration Reference</a></li>
<li><a href="#reward-functions-2" id="toc-reward-functions-2" class="nav-link" data-scroll-target="#reward-functions-2">Reward Functions</a></li>
</ul></li>
<li><a href="#using-local-dataset-files" id="toc-using-local-dataset-files" class="nav-link" data-scroll-target="#using-local-dataset-files">Using local dataset files</a></li>
<li><a href="#trl-auto-unwrapping-for-peft" id="toc-trl-auto-unwrapping-for-peft" class="nav-link" data-scroll-target="#trl-auto-unwrapping-for-peft">TRL auto-unwrapping for PEFT</a></li>
</ul></li>
</ul>
</nav>
</div>
<!-- main -->
<main class="content" id="quarto-document-content">
<header id="title-block-header" class="quarto-title-block default"><nav class="quarto-page-breadcrumbs quarto-title-breadcrumbs d-none d-lg-block" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="../docs/multimodal.html">How To Guides</a></li><li class="breadcrumb-item"><a href="../docs/rlhf.html">RLHF (Beta)</a></li></ol></nav>
<div class="quarto-title">
<h1 class="title">RLHF (Beta)</h1>
</div>
<div>
<div class="description">
Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human feedback.
</div>
</div>
<div class="quarto-title-meta">
</div>
</header>
<section id="overview" class="level2">
<h2 class="anchored" data-anchor-id="overview">Overview</h2>
<p>Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human
feedback. Various methods include, but are not limited to:</p>
<ul>
<li><a href="#dpo">Direct Preference Optimization (DPO)</a></li>
<li><a href="#ipo">Identity Preference Optimization (IPO)</a></li>
<li><a href="#kto">Kahneman-Tversky Optimization (KTO)</a></li>
<li><a href="#orpo">Odds Ratio Preference Optimization (ORPO)</a></li>
<li><a href="#grpo">Group Relative Policy Optimization (GRPO)</a></li>
<li><a href="#gdpo">Group Reward-Decoupled Policy Optimization (GDPO)</a></li>
<li><a href="#ebft">Energy-Based Fine-Tuning (EBFT)</a></li>
<li><a href="#nemo-gym-integration">NeMo Gym Integration</a></li>
</ul>
</section>
<section id="rlhf-using-axolotl" class="level2">
<h2 class="anchored" data-anchor-id="rlhf-using-axolotl">RLHF using Axolotl</h2>
<div class="callout callout-style-default callout-important callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Important
</div>
</div>
<div class="callout-body-container callout-body">
<p>This is a BETA feature and many features are not fully implemented. You are encouraged to open new PRs to improve the integration and functionality.</p>
</div>
</div>
<p>We rely on the <a href="https://github.com/huggingface/trl">TRL</a> library for implementations of various RL training methods, which we wrap and expose in Axolotl. Each method has its own supported ways of loading datasets and prompt formats.</p>
<div class="callout callout-style-default callout-tip callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p>You can find what each method supports by going into <code>src/axolotl/prompt_strategies/{method}</code> where <code>{method}</code> is one of our supported methods. The <code>type:</code> can be retrieved from <code>{method}.{function_name}</code>.</p>
</div>
</div>
<section id="dpo" class="level3">
<h3 class="anchored" data-anchor-id="dpo">DPO</h3>
<p>Example config:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> dpo</span></span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> Intel/orca_dpo_pairs</span></span>
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">split</span><span class="kw">:</span><span class="at"> train</span></span>
<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> chatml.intel</span></span>
<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> argilla/ultrafeedback-binarized-preferences</span></span>
<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">split</span><span class="kw">:</span><span class="at"> train</span></span>
<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> chatml</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>DPO supports the following types, each with its own dataset format:</p>
<section id="chatml.argilla" class="level4">
<h4 class="anchored" data-anchor-id="chatml.argilla">chatml.argilla</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"instruction"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"chosen_response"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rejected_response"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="chatml.argilla_chat" class="level4">
<h4 class="anchored" data-anchor-id="chatml.argilla_chat">chatml.argilla_chat</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb3"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"chosen"</span><span class="fu">:</span> <span class="ot">[</span></span>
<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">,</span></span>
<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span>
<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span><span class="fu">,</span></span>
<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rejected"</span><span class="fu">:</span> <span class="ot">[</span></span>
<span id="cb3-7"><a href="#cb3-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">,</span></span>
<span id="cb3-8"><a href="#cb3-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span>
<span id="cb3-9"><a href="#cb3-9" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span></span>
<span id="cb3-10"><a href="#cb3-10" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="chatml.icr" class="level4">
<h4 class="anchored" data-anchor-id="chatml.icr">chatml.icr</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb4"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"input"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"chosen"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rejected"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb4-6"><a href="#cb4-6" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="chatml.intel" class="level4">
<h4 class="anchored" data-anchor-id="chatml.intel">chatml.intel</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb5"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"question"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"chosen"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rejected"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="chatml.prompt_pairs" class="level4">
<h4 class="anchored" data-anchor-id="chatml.prompt_pairs">chatml.prompt_pairs</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb6"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"prompt"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"chosen"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rejected"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="chatml.ultra" class="level4">
<h4 class="anchored" data-anchor-id="chatml.ultra">chatml.ultra</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb7"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"prompt"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"chosen"</span><span class="fu">:</span> <span class="ot">[</span></span>
<span id="cb7-5"><a href="#cb7-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">,</span></span>
<span id="cb7-6"><a href="#cb7-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span>
<span id="cb7-7"><a href="#cb7-7" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span><span class="fu">,</span></span>
<span id="cb7-8"><a href="#cb7-8" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rejected"</span><span class="fu">:</span> <span class="ot">[</span></span>
<span id="cb7-9"><a href="#cb7-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">,</span></span>
<span id="cb7-10"><a href="#cb7-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span>
<span id="cb7-11"><a href="#cb7-11" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span></span>
<span id="cb7-12"><a href="#cb7-12" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="llama3.argilla" class="level4">
<h4 class="anchored" data-anchor-id="llama3.argilla">llama3.argilla</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb8"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"instruction"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"chosen_response"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb8-5"><a href="#cb8-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rejected_response"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb8-6"><a href="#cb8-6" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="llama3.argilla_chat" class="level4">
<h4 class="anchored" data-anchor-id="llama3.argilla_chat">llama3.argilla_chat</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb9"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"chosen"</span><span class="fu">:</span> <span class="ot">[</span></span>
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">,</span></span>
<span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span>
<span id="cb9-5"><a href="#cb9-5" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span><span class="fu">,</span></span>
<span id="cb9-6"><a href="#cb9-6" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rejected"</span><span class="fu">:</span> <span class="ot">[</span></span>
<span id="cb9-7"><a href="#cb9-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">,</span></span>
<span id="cb9-8"><a href="#cb9-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span>
<span id="cb9-9"><a href="#cb9-9" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span></span>
<span id="cb9-10"><a href="#cb9-10" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="llama3.icr" class="level4">
<h4 class="anchored" data-anchor-id="llama3.icr">llama3.icr</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb10"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb10-3"><a href="#cb10-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"input"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb10-4"><a href="#cb10-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"chosen"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb10-5"><a href="#cb10-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rejected"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb10-6"><a href="#cb10-6" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="llama3.intel" class="level4">
<h4 class="anchored" data-anchor-id="llama3.intel">llama3.intel</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb11"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"question"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb11-4"><a href="#cb11-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"chosen"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb11-5"><a href="#cb11-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rejected"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb11-6"><a href="#cb11-6" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="llama3.prompt_pairs" class="level4">
<h4 class="anchored" data-anchor-id="llama3.prompt_pairs">llama3.prompt_pairs</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb12"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"prompt"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb12-4"><a href="#cb12-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"chosen"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb12-5"><a href="#cb12-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rejected"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb12-6"><a href="#cb12-6" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="llama3.ultra" class="level4">
<h4 class="anchored" data-anchor-id="llama3.ultra">llama3.ultra</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb13"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"prompt"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb13-4"><a href="#cb13-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"chosen"</span><span class="fu">:</span> <span class="ot">[</span></span>
<span id="cb13-5"><a href="#cb13-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">,</span></span>
<span id="cb13-6"><a href="#cb13-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span>
<span id="cb13-7"><a href="#cb13-7" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span><span class="fu">,</span></span>
<span id="cb13-8"><a href="#cb13-8" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rejected"</span><span class="fu">:</span> <span class="ot">[</span></span>
<span id="cb13-9"><a href="#cb13-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">,</span></span>
<span id="cb13-10"><a href="#cb13-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span>
<span id="cb13-11"><a href="#cb13-11" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span></span>
<span id="cb13-12"><a href="#cb13-12" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="zephyr.nectar" class="level4">
<h4 class="anchored" data-anchor-id="zephyr.nectar">zephyr.nectar</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb14"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"prompt"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"answers"</span><span class="fu">:</span> <span class="ot">[</span></span>
<span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span></span>
<span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">"answer"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb14-6"><a href="#cb14-6" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rank"</span><span class="fu">:</span> <span class="dv">1</span></span>
<span id="cb14-7"><a href="#cb14-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span><span class="ot">,</span></span>
<span id="cb14-8"><a href="#cb14-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span></span>
<span id="cb14-9"><a href="#cb14-9" aria-hidden="true" tabindex="-1"></a> <span class="dt">"answer"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb14-10"><a href="#cb14-10" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rank"</span><span class="fu">:</span> <span class="dv">2</span></span>
<span id="cb14-11"><a href="#cb14-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span></span>
<span id="cb14-12"><a href="#cb14-12" aria-hidden="true" tabindex="-1"></a> <span class="er">//</span> <span class="er">...</span> <span class="er">more</span> <span class="er">answers</span> <span class="er">with</span> <span class="er">ranks</span></span>
<span id="cb14-13"><a href="#cb14-13" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span></span>
<span id="cb14-14"><a href="#cb14-14" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="chat_template.argilla_chat" class="level4">
<h4 class="anchored" data-anchor-id="chat_template.argilla_chat">chat_template.argilla_chat</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb15"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"chosen"</span><span class="fu">:</span> <span class="ot">[</span></span>
<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">,</span></span>
<span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span>
<span id="cb15-5"><a href="#cb15-5" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span><span class="fu">,</span></span>
<span id="cb15-6"><a href="#cb15-6" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rejected"</span><span class="fu">:</span> <span class="ot">[</span></span>
<span id="cb15-7"><a href="#cb15-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">,</span></span>
<span id="cb15-8"><a href="#cb15-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span>
<span id="cb15-9"><a href="#cb15-9" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span></span>
<span id="cb15-10"><a href="#cb15-10" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="chat_template.default" class="level4">
<h4 class="anchored" data-anchor-id="chat_template.default">chat_template.default</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb16"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> dpo</span></span>
<span id="cb16-2"><a href="#cb16-2" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
<span id="cb16-3"><a href="#cb16-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> ...</span></span>
<span id="cb16-4"><a href="#cb16-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">split</span><span class="kw">:</span><span class="at"> train</span></span>
<span id="cb16-5"><a href="#cb16-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> chat_template.default</span></span>
<span id="cb16-6"><a href="#cb16-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_messages</span><span class="kw">:</span><span class="at"> </span><span class="st">"messages"</span></span>
<span id="cb16-7"><a href="#cb16-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_chosen</span><span class="kw">:</span><span class="at"> </span><span class="st">"chosen"</span></span>
<span id="cb16-8"><a href="#cb16-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_rejected</span><span class="kw">:</span><span class="at"> </span><span class="st">"rejected"</span></span>
<span id="cb16-9"><a href="#cb16-9" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">message_property_mappings</span><span class="kw">:</span></span>
<span id="cb16-10"><a href="#cb16-10" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">role</span><span class="kw">:</span><span class="at"> role</span></span>
<span id="cb16-11"><a href="#cb16-11" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">content</span><span class="kw">:</span><span class="at"> content</span></span>
<span id="cb16-12"><a href="#cb16-12" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">roles</span><span class="kw">:</span></span>
<span id="cb16-13"><a href="#cb16-13" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">user</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"user"</span><span class="kw">]</span></span>
<span id="cb16-14"><a href="#cb16-14" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">assistant</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"assistant"</span><span class="kw">]</span></span>
<span id="cb16-15"><a href="#cb16-15" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">system</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"system"</span><span class="kw">]</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>Sample input format:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb17"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb17-2"><a href="#cb17-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"messages"</span><span class="fu">:</span> <span class="ot">[</span></span>
<span id="cb17-3"><a href="#cb17-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span></span>
<span id="cb17-4"><a href="#cb17-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"role"</span><span class="fu">:</span> <span class="st">"system"</span><span class="fu">,</span></span>
<span id="cb17-5"><a href="#cb17-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb17-6"><a href="#cb17-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span><span class="ot">,</span></span>
<span id="cb17-7"><a href="#cb17-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span></span>
<span id="cb17-8"><a href="#cb17-8" aria-hidden="true" tabindex="-1"></a> <span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span></span>
<span id="cb17-9"><a href="#cb17-9" aria-hidden="true" tabindex="-1"></a> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb17-10"><a href="#cb17-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span><span class="ot">,</span></span>
<span id="cb17-11"><a href="#cb17-11" aria-hidden="true" tabindex="-1"></a> <span class="er">//</span> <span class="er">...</span> <span class="er">more</span> <span class="er">messages</span></span>
<span id="cb17-12"><a href="#cb17-12" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span><span class="fu">,</span></span>
<span id="cb17-13"><a href="#cb17-13" aria-hidden="true" tabindex="-1"></a> <span class="dt">"chosen"</span><span class="fu">:</span> <span class="fu">{</span></span>
<span id="cb17-14"><a href="#cb17-14" aria-hidden="true" tabindex="-1"></a> <span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span></span>
<span id="cb17-15"><a href="#cb17-15" aria-hidden="true" tabindex="-1"></a> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb17-16"><a href="#cb17-16" aria-hidden="true" tabindex="-1"></a> <span class="fu">},</span></span>
<span id="cb17-17"><a href="#cb17-17" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rejected"</span><span class="fu">:</span> <span class="fu">{</span></span>
<span id="cb17-18"><a href="#cb17-18" aria-hidden="true" tabindex="-1"></a> <span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span></span>
<span id="cb17-19"><a href="#cb17-19" aria-hidden="true" tabindex="-1"></a> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb17-20"><a href="#cb17-20" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span></span>
<span id="cb17-21"><a href="#cb17-21" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="user_defined.default" class="level4">
<h4 class="anchored" data-anchor-id="user_defined.default">user_defined.default</h4>
<p>For custom behaviors, specify the dataset field names and format strings under <code>type</code>:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb18"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> dpo</span></span>
<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
<span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> ...</span></span>
<span id="cb18-4"><a href="#cb18-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">split</span><span class="kw">:</span><span class="at"> train</span></span>
<span id="cb18-5"><a href="#cb18-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span></span>
<span id="cb18-6"><a href="#cb18-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_prompt</span><span class="kw">:</span><span class="at"> </span><span class="st">"prompt"</span></span>
<span id="cb18-7"><a href="#cb18-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_system</span><span class="kw">:</span><span class="at"> </span><span class="st">"system"</span></span>
<span id="cb18-8"><a href="#cb18-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_chosen</span><span class="kw">:</span><span class="at"> </span><span class="st">"chosen"</span></span>
<span id="cb18-9"><a href="#cb18-9" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_rejected</span><span class="kw">:</span><span class="at"> </span><span class="st">"rejected"</span></span>
<span id="cb18-10"><a href="#cb18-10" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">prompt_format</span><span class="kw">:</span><span class="at"> </span><span class="st">"{prompt}"</span></span>
<span id="cb18-11"><a href="#cb18-11" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">chosen_format</span><span class="kw">:</span><span class="at"> </span><span class="st">"{chosen}"</span></span>
<span id="cb18-12"><a href="#cb18-12" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">rejected_format</span><span class="kw">:</span><span class="at"> </span><span class="st">"{rejected}"</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>The input format is a simple JSON object whose field names are customizable through the config above.</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb19"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"prompt"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb19-4"><a href="#cb19-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"chosen"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb19-5"><a href="#cb19-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rejected"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb19-6"><a href="#cb19-6" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
</section>
<section id="ipo" class="level3">
<h3 class="anchored" data-anchor-id="ipo">IPO</h3>
<p>As IPO is just DPO with a different loss function, all supported dataset formats for <a href="#dpo">DPO</a> are also supported for IPO.</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb20"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> ipo</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="orpo" class="level3">
<h3 class="anchored" data-anchor-id="orpo">ORPO</h3>
<p>Paper: <a href="https://arxiv.org/abs/2403.07691">https://arxiv.org/abs/2403.07691</a></p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb21"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> orpo</span></span>
<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a><span class="fu">orpo_alpha</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.1</span></span>
<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a><span class="fu">remove_unused_columns</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
<span id="cb21-4"><a href="#cb21-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb21-5"><a href="#cb21-5" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template</span><span class="kw">:</span><span class="at"> chatml</span></span>
<span id="cb21-6"><a href="#cb21-6" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
<span id="cb21-7"><a href="#cb21-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> argilla/ultrafeedback-binarized-preferences-cleaned</span></span>
<span id="cb21-8"><a href="#cb21-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> chat_template.argilla</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>ORPO supports the following types with the following dataset format:</p>
<section id="chat_template.argilla" class="level4">
<h4 class="anchored" data-anchor-id="chat_template.argilla">chat_template.argilla</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb22"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb22-2"><a href="#cb22-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb22-3"><a href="#cb22-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"prompt"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">if</span> <span class="er">available,</span> <span class="er">will</span> <span class="er">be</span> <span class="er">taken</span> <span class="er">as</span> <span class="er">user</span> <span class="er">message</span> <span class="er">for</span> <span class="er">single-turn</span> <span class="er">instead</span> <span class="er">of</span> <span class="er">from</span> <span class="er">list</span> <span class="er">below</span></span>
<span id="cb22-4"><a href="#cb22-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb22-5"><a href="#cb22-5" aria-hidden="true" tabindex="-1"></a> <span class="er">//</span> <span class="er">chosen/rejected</span> <span class="er">should</span> <span class="er">be</span> <span class="er">same</span> <span class="er">till</span> <span class="er">last</span> <span class="er">content</span> <span class="er">and</span> <span class="er">only</span> <span class="er">even-number</span> <span class="er">of</span> <span class="er">alternating</span> <span class="er">user/assistant</span> <span class="er">turns</span></span>
<span id="cb22-6"><a href="#cb22-6" aria-hidden="true" tabindex="-1"></a> <span class="dt">"chosen"</span><span class="fu">:</span> <span class="ot">[</span></span>
<span id="cb22-7"><a href="#cb22-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">,</span></span>
<span id="cb22-8"><a href="#cb22-8" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span>
<span id="cb22-9"><a href="#cb22-9" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span><span class="fu">,</span></span>
<span id="cb22-10"><a href="#cb22-10" aria-hidden="true" tabindex="-1"></a> <span class="dt">"rejected"</span><span class="fu">:</span> <span class="ot">[</span></span>
<span id="cb22-11"><a href="#cb22-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">,</span></span>
<span id="cb22-12"><a href="#cb22-12" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span>
<span id="cb22-13"><a href="#cb22-13" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span></span>
<span id="cb22-14"><a href="#cb22-14" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
</section>
<section id="kto" class="level3">
<h3 class="anchored" data-anchor-id="kto">KTO</h3>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb23"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb23-1"><a href="#cb23-1" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> kto</span></span>
<span id="cb23-2"><a href="#cb23-2" aria-hidden="true" tabindex="-1"></a><span class="fu">rl_beta</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.1</span><span class="co"> # default</span></span>
<span id="cb23-3"><a href="#cb23-3" aria-hidden="true" tabindex="-1"></a><span class="fu">kto_desirable_weight</span><span class="kw">:</span><span class="at"> </span><span class="fl">1.0</span><span class="co"> # default</span></span>
<span id="cb23-4"><a href="#cb23-4" aria-hidden="true" tabindex="-1"></a><span class="fu">kto_undesirable_weight</span><span class="kw">:</span><span class="at"> </span><span class="fl">1.0</span><span class="co"> # default</span></span>
<span id="cb23-5"><a href="#cb23-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-6"><a href="#cb23-6" aria-hidden="true" tabindex="-1"></a><span class="fu">remove_unused_columns</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
<span id="cb23-7"><a href="#cb23-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-8"><a href="#cb23-8" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
<span id="cb23-9"><a href="#cb23-9" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> argilla/ultrafeedback-binarized-preferences-cleaned-kto</span></span>
<span id="cb23-10"><a href="#cb23-10" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> llama3.ultra</span></span>
<span id="cb23-11"><a href="#cb23-11" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">split</span><span class="kw">:</span><span class="at"> train</span></span>
<span id="cb23-12"><a href="#cb23-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-13"><a href="#cb23-13" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_checkpointing</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb23-14"><a href="#cb23-14" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_checkpointing_kwargs</span><span class="kw">:</span></span>
<span id="cb23-15"><a href="#cb23-15" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_reentrant</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>KTO supports the following types with the following dataset format:</p>
<section id="chatml.argilla-1" class="level4">
<h4 class="anchored" data-anchor-id="chatml.argilla-1">chatml.argilla</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb24"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb24-1"><a href="#cb24-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb24-2"><a href="#cb24-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb24-3"><a href="#cb24-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"instruction"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb24-4"><a href="#cb24-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"completion"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb24-5"><a href="#cb24-5" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="chatml.argilla_chat-1" class="level4">
<h4 class="anchored" data-anchor-id="chatml.argilla_chat-1">chatml.argilla_chat</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb25"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb25-1"><a href="#cb25-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb25-2"><a href="#cb25-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"chosen"</span><span class="fu">:</span> <span class="ot">[</span></span>
<span id="cb25-3"><a href="#cb25-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span>
<span id="cb25-4"><a href="#cb25-4" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span><span class="fu">,</span></span>
<span id="cb25-5"><a href="#cb25-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">"completion"</span><span class="fu">:</span> <span class="ot">[</span></span>
<span id="cb25-6"><a href="#cb25-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span>
<span id="cb25-7"><a href="#cb25-7" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span></span>
<span id="cb25-8"><a href="#cb25-8" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="chatml.intel-1" class="level4">
<h4 class="anchored" data-anchor-id="chatml.intel-1">chatml.intel</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb26"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb26-1"><a href="#cb26-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb26-2"><a href="#cb26-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb26-3"><a href="#cb26-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"question"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb26-4"><a href="#cb26-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"completion"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb26-5"><a href="#cb26-5" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="chatml.prompt_pairs-1" class="level4">
<h4 class="anchored" data-anchor-id="chatml.prompt_pairs-1">chatml.prompt_pairs</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb27"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb27-1"><a href="#cb27-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb27-2"><a href="#cb27-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb27-3"><a href="#cb27-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"prompt"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb27-4"><a href="#cb27-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"completion"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb27-5"><a href="#cb27-5" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="chatml.ultra-1" class="level4">
<h4 class="anchored" data-anchor-id="chatml.ultra-1">chatml.ultra</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb28"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb28-1"><a href="#cb28-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb28-2"><a href="#cb28-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb28-3"><a href="#cb28-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"prompt"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb28-4"><a href="#cb28-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"completion"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb28-5"><a href="#cb28-5" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="llama3.argilla-1" class="level4">
<h4 class="anchored" data-anchor-id="llama3.argilla-1">llama3.argilla</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb29"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb29-1"><a href="#cb29-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb29-2"><a href="#cb29-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb29-3"><a href="#cb29-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"instruction"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb29-4"><a href="#cb29-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"completion"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb29-5"><a href="#cb29-5" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="llama3.argilla_chat-1" class="level4">
<h4 class="anchored" data-anchor-id="llama3.argilla_chat-1">llama3.argilla_chat</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb30"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb30-1"><a href="#cb30-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb30-2"><a href="#cb30-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"completion"</span><span class="fu">:</span> <span class="ot">[</span></span>
<span id="cb30-3"><a href="#cb30-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">,</span></span>
<span id="cb30-4"><a href="#cb30-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span>
<span id="cb30-5"><a href="#cb30-5" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span></span>
<span id="cb30-6"><a href="#cb30-6" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="llama3.intel-1" class="level4">
<h4 class="anchored" data-anchor-id="llama3.intel-1">llama3.intel</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb31"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb31-1"><a href="#cb31-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb31-2"><a href="#cb31-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb31-3"><a href="#cb31-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"question"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb31-4"><a href="#cb31-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"completion"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb31-5"><a href="#cb31-5" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="llama3.prompt_pairs-1" class="level4">
<h4 class="anchored" data-anchor-id="llama3.prompt_pairs-1">llama3.prompt_pairs</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb32"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb32-1"><a href="#cb32-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb32-2"><a href="#cb32-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb32-3"><a href="#cb32-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"prompt"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb32-4"><a href="#cb32-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"completion"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb32-5"><a href="#cb32-5" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="llama3.ultra-1" class="level4">
<h4 class="anchored" data-anchor-id="llama3.ultra-1">llama3.ultra</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb33"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb33-1"><a href="#cb33-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb33-2"><a href="#cb33-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb33-3"><a href="#cb33-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"prompt"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb33-4"><a href="#cb33-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"completion"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb33-5"><a href="#cb33-5" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="user_defined.default-1" class="level4">
<h4 class="anchored" data-anchor-id="user_defined.default-1">user_defined.default</h4>
<p>For custom behaviors, define the dataset <code>type</code> fields explicitly:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb34"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb34-1"><a href="#cb34-1" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> kto</span></span>
<span id="cb34-2"><a href="#cb34-2" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
<span id="cb34-3"><a href="#cb34-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> ...</span></span>
<span id="cb34-4"><a href="#cb34-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">split</span><span class="kw">:</span><span class="at"> train</span></span>
<span id="cb34-5"><a href="#cb34-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span></span>
<span id="cb34-6"><a href="#cb34-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_prompt</span><span class="kw">:</span><span class="at"> </span><span class="st">"prompt"</span></span>
<span id="cb34-7"><a href="#cb34-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_system</span><span class="kw">:</span><span class="at"> </span><span class="st">"system"</span></span>
<span id="cb34-8"><a href="#cb34-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_completion</span><span class="kw">:</span><span class="at"> </span><span class="st">"completion"</span></span>
<span id="cb34-9"><a href="#cb34-9" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_label</span><span class="kw">:</span><span class="at"> </span><span class="st">"label"</span></span>
<span id="cb34-10"><a href="#cb34-10" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">prompt_format</span><span class="kw">:</span><span class="at"> </span><span class="st">"{prompt}"</span></span>
<span id="cb34-11"><a href="#cb34-11" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">completion_format</span><span class="kw">:</span><span class="at"> </span><span class="st">"{completion}"</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>The input format is a simple JSON input with customizable fields based on the above config.</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb35"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb35-1"><a href="#cb35-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
<span id="cb35-2"><a href="#cb35-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"system"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="er">//</span> <span class="er">optional</span></span>
<span id="cb35-3"><a href="#cb35-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">"prompt"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb35-4"><a href="#cb35-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"completion"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span></span>
<span id="cb35-5"><a href="#cb35-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">"label"</span><span class="fu">:</span> <span class="st">"..."</span></span>
<span id="cb35-6"><a href="#cb35-6" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
</section>
<section id="grpo" class="level3">
<h3 class="anchored" data-anchor-id="grpo">GRPO</h3>
<div class="callout callout-style-default callout-tip callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p>Check out our <a href="https://github.com/axolotl-ai-cloud/grpo_code">GRPO cookbook</a>.</p>
</div>
</div>
<p>In the latest GRPO implementation, <code>vLLM</code> is used to significantly speed up trajectory generation during training. In this example, we're using 4 GPUs - 2 for training, and 2 for vLLM:</p>
<div class="callout callout-style-default callout-important callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Important
</div>
</div>
<div class="callout-body-container callout-body">
<p>Make sure you've installed the correct version of vLLM by including it as an extra when installing axolotl, e.g.&nbsp;<code>pip install axolotl[vllm]</code>.</p>
</div>
</div>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb36"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb36-1"><a href="#cb36-1" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model</span><span class="kw">:</span><span class="at"> Qwen/Qwen2.5-1.5B-Instruct</span></span>
<span id="cb36-2"><a href="#cb36-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb36-3"><a href="#cb36-3" aria-hidden="true" tabindex="-1"></a><span class="fu">vllm</span><span class="kw">:</span></span>
<span id="cb36-4"><a href="#cb36-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">host</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.0.0.0</span></span>
<span id="cb36-5"><a href="#cb36-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">port</span><span class="kw">:</span><span class="at"> </span><span class="dv">8000</span></span>
<span id="cb36-6"><a href="#cb36-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">tensor_parallel_size</span><span class="kw">:</span><span class="at"> </span><span class="dv">2</span></span>
<span id="cb36-7"><a href="#cb36-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">gpu_memory_utilization</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.85</span></span>
<span id="cb36-8"><a href="#cb36-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">dtype</span><span class="kw">:</span><span class="at"> auto</span></span>
<span id="cb36-9"><a href="#cb36-9" aria-hidden="true" tabindex="-1"></a><span class="co"> # max_model_len: # you may find it useful to set the vLLM model context length if you know this beforehand</span></span>
<span id="cb36-10"><a href="#cb36-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb36-11"><a href="#cb36-11" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> grpo</span></span>
<span id="cb36-12"><a href="#cb36-12" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb36-13"><a href="#cb36-13" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_vllm</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb36-14"><a href="#cb36-14" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_server_host</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.0.0.0</span></span>
<span id="cb36-15"><a href="#cb36-15" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_server_port</span><span class="kw">:</span><span class="at"> </span><span class="dv">8000</span></span>
<span id="cb36-16"><a href="#cb36-16" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_server_timeout</span><span class="kw">:</span><span class="at"> </span><span class="dv">300</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb37"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb37-1"><a href="#cb37-1" aria-hidden="true" tabindex="-1"></a><span class="va">CUDA_VISIBLE_DEVICES</span><span class="op">=</span>2,3 <span class="ex">axolotl</span> vllm-serve grpo.yaml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>Your <code>vLLM</code> instance will now attempt to spin up, and it's time to kick off training utilizing our remaining two GPUs. In another terminal, execute:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb38"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb38-1"><a href="#cb38-1" aria-hidden="true" tabindex="-1"></a><span class="va">CUDA_VISIBLE_DEVICES</span><span class="op">=</span>0,1 <span class="ex">axolotl</span> train grpo.yaml <span class="at">--num-processes</span> 2</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<div class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Note
</div>
</div>
<div class="callout-body-container callout-body">
<p>Due to TRL's implementation with vLLM, the vLLM instance must use the last N GPUs instead of the first N GPUs. This is why in the example above, we use <code>CUDA_VISIBLE_DEVICES=2,3</code> for the vLLM instance.</p>
</div>
</div>
<section id="reward-functions" class="level4">
<h4 class="anchored" data-anchor-id="reward-functions">Reward functions</h4>
<p>GRPO uses custom reward functions and transformations. Please have them ready locally.</p>
<p>For example, to load OpenAI's GSM8K and use a random reward for completions:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb39"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb39-1"><a href="#cb39-1" aria-hidden="true" tabindex="-1"></a><span class="co"># rewards.py</span></span>
<span id="cb39-2"><a href="#cb39-2" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> random</span>
<span id="cb39-3"><a href="#cb39-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb39-4"><a href="#cb39-4" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> rand_reward_func(completions, <span class="op">**</span>kwargs) <span class="op">-&gt;</span> <span class="bu">list</span>[<span class="bu">float</span>]:</span>
<span id="cb39-5"><a href="#cb39-5" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> [random.uniform(<span class="dv">0</span>, <span class="dv">1</span>) <span class="cf">for</span> _ <span class="kw">in</span> completions]</span>
<span id="cb39-6"><a href="#cb39-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb39-7"><a href="#cb39-7" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> oai_gsm8k_transform(cfg, <span class="op">*</span>args, <span class="op">**</span>kwargs):</span>
<span id="cb39-8"><a href="#cb39-8" aria-hidden="true" tabindex="-1"></a> <span class="kw">def</span> transform_fn(example, tokenizer<span class="op">=</span><span class="va">None</span>):</span>
<span id="cb39-9"><a href="#cb39-9" aria-hidden="true" tabindex="-1"></a> label <span class="op">=</span> example[<span class="st">"answer"</span>].split(<span class="st">"####"</span>)[<span class="op">-</span><span class="dv">1</span>].strip().replace(<span class="st">","</span>, <span class="st">""</span>)</span>
<span id="cb39-10"><a href="#cb39-10" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {</span>
<span id="cb39-11"><a href="#cb39-11" aria-hidden="true" tabindex="-1"></a> <span class="st">"prompt"</span>: [{<span class="st">"role"</span>: <span class="st">"user"</span>, <span class="st">"content"</span>: example[<span class="st">"question"</span>]},],</span>
<span id="cb39-12"><a href="#cb39-12" aria-hidden="true" tabindex="-1"></a> <span class="st">"answer"</span>: label,</span>
<span id="cb39-13"><a href="#cb39-13" aria-hidden="true" tabindex="-1"></a> }</span>
<span id="cb39-14"><a href="#cb39-14" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> transform_fn, {<span class="st">"remove_columns"</span>: [<span class="st">"question"</span>]}</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb40"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb40-1"><a href="#cb40-1" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> grpo</span></span>
<span id="cb40-2"><a href="#cb40-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb40-3"><a href="#cb40-3" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb40-4"><a href="#cb40-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">beta</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.001</span></span>
<span id="cb40-5"><a href="#cb40-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">max_completion_length</span><span class="kw">:</span><span class="at"> </span><span class="dv">256</span></span>
<span id="cb40-6"><a href="#cb40-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_vllm</span><span class="kw">:</span><span class="at"> </span><span class="ch">True</span></span>
<span id="cb40-7"><a href="#cb40-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">num_generations</span><span class="kw">:</span><span class="at"> </span><span class="dv">4</span></span>
<span id="cb40-8"><a href="#cb40-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">reward_funcs</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"rewards.rand_reward_func"</span><span class="kw">]</span><span class="co"> # format: '{file_name}.{fn_name}'</span></span>
<span id="cb40-9"><a href="#cb40-9" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">reward_weights</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="fl">1.0</span><span class="kw">]</span></span>
<span id="cb40-10"><a href="#cb40-10" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
<span id="cb40-11"><a href="#cb40-11" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> openai/gsm8k</span></span>
<span id="cb40-12"><a href="#cb40-12" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">name</span><span class="kw">:</span><span class="at"> main</span></span>
<span id="cb40-13"><a href="#cb40-13" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> rewards.oai_gsm8k_transform</span><span class="co"> # format: '{file_name}.{fn_name}'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>To see other examples of custom reward functions, please see <a href="https://github.com/huggingface/trl/blob/main/docs/source/grpo_trainer.md#using-a-custom-reward-function">TRL GRPO Docs</a>.</p>
<p>To see all configs, please see <a href="https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py">TRLConfig</a>.</p>
</section>
<section id="openenv-rollout-functions" class="level4">
<h4 class="anchored" data-anchor-id="openenv-rollout-functions">OpenEnv Rollout Functions</h4>
<p>GRPO supports custom rollout functions for OpenEnv-style environments, enabling interactive tasks like web browsing, code execution, or tool use. This allows you to implement custom generation logic that interacts with external environments.</p>
<p>For example, to implement a simple math-solving environment with step-by-step verification:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb41"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb41-1"><a href="#cb41-1" aria-hidden="true" tabindex="-1"></a><span class="co"># math_env.py</span></span>
<span id="cb41-2"><a href="#cb41-2" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> re</span>
<span id="cb41-3"><a href="#cb41-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb41-4"><a href="#cb41-4" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> math_solver_rollout(model, processing_class, prompts, generation_config<span class="op">=</span><span class="va">None</span>):</span>
<span id="cb41-5"><a href="#cb41-5" aria-hidden="true" tabindex="-1"></a> <span class="co">"""</span></span>
<span id="cb41-6"><a href="#cb41-6" aria-hidden="true" tabindex="-1"></a><span class="co"> Custom rollout function that generates step-by-step math solutions.</span></span>
<span id="cb41-7"><a href="#cb41-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb41-8"><a href="#cb41-8" aria-hidden="true" tabindex="-1"></a><span class="co"> Args:</span></span>
<span id="cb41-9"><a href="#cb41-9" aria-hidden="true" tabindex="-1"></a><span class="co"> model: The language model</span></span>
<span id="cb41-10"><a href="#cb41-10" aria-hidden="true" tabindex="-1"></a><span class="co"> processing_class: The tokenizer/processing_class</span></span>
<span id="cb41-11"><a href="#cb41-11" aria-hidden="true" tabindex="-1"></a><span class="co"> prompts: List of prompt dicts (with 'messages' key for chat format)</span></span>
<span id="cb41-12"><a href="#cb41-12" aria-hidden="true" tabindex="-1"></a><span class="co"> generation_config: Optional generation configuration</span></span>
<span id="cb41-13"><a href="#cb41-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb41-14"><a href="#cb41-14" aria-hidden="true" tabindex="-1"></a><span class="co"> Returns:</span></span>
<span id="cb41-15"><a href="#cb41-15" aria-hidden="true" tabindex="-1"></a><span class="co"> List of completion strings</span></span>
<span id="cb41-16"><a href="#cb41-16" aria-hidden="true" tabindex="-1"></a><span class="co"> """</span></span>
<span id="cb41-17"><a href="#cb41-17" aria-hidden="true" tabindex="-1"></a> completions <span class="op">=</span> []</span>
<span id="cb41-18"><a href="#cb41-18" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb41-19"><a href="#cb41-19" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> prompt <span class="kw">in</span> prompts:</span>
<span id="cb41-20"><a href="#cb41-20" aria-hidden="true" tabindex="-1"></a> <span class="co"># Apply chat template to prompt</span></span>
<span id="cb41-21"><a href="#cb41-21" aria-hidden="true" tabindex="-1"></a> messages <span class="op">=</span> prompt.get(<span class="st">"messages"</span>, [])</span>
<span id="cb41-22"><a href="#cb41-22" aria-hidden="true" tabindex="-1"></a> formatted_prompt <span class="op">=</span> processing_class.apply_chat_template(</span>
<span id="cb41-23"><a href="#cb41-23" aria-hidden="true" tabindex="-1"></a> messages, tokenize<span class="op">=</span><span class="va">False</span>, add_generation_prompt<span class="op">=</span><span class="va">True</span></span>
<span id="cb41-24"><a href="#cb41-24" aria-hidden="true" tabindex="-1"></a> )</span>
<span id="cb41-25"><a href="#cb41-25" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb41-26"><a href="#cb41-26" aria-hidden="true" tabindex="-1"></a> <span class="co"># Generate step-by-step solution</span></span>
<span id="cb41-27"><a href="#cb41-27" aria-hidden="true" tabindex="-1"></a> full_response <span class="op">=</span> <span class="st">""</span></span>
<span id="cb41-28"><a href="#cb41-28" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> step <span class="kw">in</span> <span class="bu">range</span>(<span class="dv">5</span>): <span class="co"># Max 5 reasoning steps</span></span>
<span id="cb41-29"><a href="#cb41-29" aria-hidden="true" tabindex="-1"></a> current_input <span class="op">=</span> formatted_prompt <span class="op">+</span> full_response <span class="op">+</span> <span class="st">"</span><span class="ch">\n</span><span class="st">Next step:"</span></span>
<span id="cb41-30"><a href="#cb41-30" aria-hidden="true" tabindex="-1"></a> inputs <span class="op">=</span> processing_class(current_input, return_tensors<span class="op">=</span><span class="st">"pt"</span>).to(model.device)</span>
<span id="cb41-31"><a href="#cb41-31" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb41-32"><a href="#cb41-32" aria-hidden="true" tabindex="-1"></a> outputs <span class="op">=</span> model.generate(</span>
<span id="cb41-33"><a href="#cb41-33" aria-hidden="true" tabindex="-1"></a> <span class="op">**</span>inputs,</span>
<span id="cb41-34"><a href="#cb41-34" aria-hidden="true" tabindex="-1"></a> max_new_tokens<span class="op">=</span><span class="dv">100</span>,</span>
<span id="cb41-35"><a href="#cb41-35" aria-hidden="true" tabindex="-1"></a> generation_config<span class="op">=</span>generation_config,</span>
<span id="cb41-36"><a href="#cb41-36" aria-hidden="true" tabindex="-1"></a> )</span>
<span id="cb41-37"><a href="#cb41-37" aria-hidden="true" tabindex="-1"></a> step_text <span class="op">=</span> processing_class.decode(</span>
<span id="cb41-38"><a href="#cb41-38" aria-hidden="true" tabindex="-1"></a> outputs[<span class="dv">0</span>][inputs.input_ids.shape[<span class="dv">1</span>]:],</span>
<span id="cb41-39"><a href="#cb41-39" aria-hidden="true" tabindex="-1"></a> skip_special_tokens<span class="op">=</span><span class="va">True</span></span>
<span id="cb41-40"><a href="#cb41-40" aria-hidden="true" tabindex="-1"></a> )</span>
<span id="cb41-41"><a href="#cb41-41" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb41-42"><a href="#cb41-42" aria-hidden="true" tabindex="-1"></a> <span class="co"># Check if solution is complete</span></span>
<span id="cb41-43"><a href="#cb41-43" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> <span class="st">"FINAL ANSWER:"</span> <span class="kw">in</span> step_text:</span>
<span id="cb41-44"><a href="#cb41-44" aria-hidden="true" tabindex="-1"></a> full_response <span class="op">+=</span> step_text</span>
<span id="cb41-45"><a href="#cb41-45" aria-hidden="true" tabindex="-1"></a> <span class="cf">break</span></span>
<span id="cb41-46"><a href="#cb41-46" aria-hidden="true" tabindex="-1"></a> full_response <span class="op">+=</span> step_text <span class="op">+</span> <span class="st">"</span><span class="ch">\n</span><span class="st">"</span></span>
<span id="cb41-47"><a href="#cb41-47" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb41-48"><a href="#cb41-48" aria-hidden="true" tabindex="-1"></a> completions.append(full_response)</span>
<span id="cb41-49"><a href="#cb41-49" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb41-50"><a href="#cb41-50" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> completions</span>
<span id="cb41-51"><a href="#cb41-51" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb41-52"><a href="#cb41-52" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> math_reward(prompts, completions, answers, <span class="op">**</span>kwargs):</span>
<span id="cb41-53"><a href="#cb41-53" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Reward function that checks mathematical correctness"""</span></span>
<span id="cb41-54"><a href="#cb41-54" aria-hidden="true" tabindex="-1"></a> rewards <span class="op">=</span> []</span>
<span id="cb41-55"><a href="#cb41-55" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> completion, correct_answer <span class="kw">in</span> <span class="bu">zip</span>(completions, answers):</span>
<span id="cb41-56"><a href="#cb41-56" aria-hidden="true" tabindex="-1"></a> <span class="co"># Extract predicted answer</span></span>
<span id="cb41-57"><a href="#cb41-57" aria-hidden="true" tabindex="-1"></a> match <span class="op">=</span> re.search(<span class="vs">r"FINAL ANSWER:</span><span class="dv">\s</span><span class="op">*</span><span class="kw">(</span><span class="dv">.</span><span class="op">+</span><span class="kw">)</span><span class="vs">"</span>, completion)</span>
<span id="cb41-58"><a href="#cb41-58" aria-hidden="true" tabindex="-1"></a> predicted <span class="op">=</span> match.group(<span class="dv">1</span>).strip() <span class="cf">if</span> match <span class="cf">else</span> <span class="st">""</span></span>
<span id="cb41-59"><a href="#cb41-59" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb41-60"><a href="#cb41-60" aria-hidden="true" tabindex="-1"></a> <span class="co"># Compare with correct answer</span></span>
<span id="cb41-61"><a href="#cb41-61" aria-hidden="true" tabindex="-1"></a> reward <span class="op">=</span> <span class="fl">1.0</span> <span class="cf">if</span> predicted <span class="op">==</span> <span class="bu">str</span>(correct_answer) <span class="cf">else</span> <span class="fl">0.0</span></span>
<span id="cb41-62"><a href="#cb41-62" aria-hidden="true" tabindex="-1"></a> rewards.append(reward)</span>
<span id="cb41-63"><a href="#cb41-63" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb41-64"><a href="#cb41-64" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> rewards</span>
<span id="cb41-65"><a href="#cb41-65" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb41-66"><a href="#cb41-66" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> math_transform(cfg, <span class="op">*</span>args, <span class="op">**</span>kwargs):</span>
<span id="cb41-67"><a href="#cb41-67" aria-hidden="true" tabindex="-1"></a> <span class="co">"""Transform dataset to GRPO format with answer field"""</span></span>
<span id="cb41-68"><a href="#cb41-68" aria-hidden="true" tabindex="-1"></a> <span class="kw">def</span> transform_fn(example, processing_class<span class="op">=</span><span class="va">None</span>):</span>
<span id="cb41-69"><a href="#cb41-69" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> {</span>
<span id="cb41-70"><a href="#cb41-70" aria-hidden="true" tabindex="-1"></a> <span class="st">"prompt"</span>: [{<span class="st">"role"</span>: <span class="st">"user"</span>, <span class="st">"content"</span>: example[<span class="st">"question"</span>]}],</span>
<span id="cb41-71"><a href="#cb41-71" aria-hidden="true" tabindex="-1"></a> <span class="st">"answer"</span>: <span class="bu">str</span>(example[<span class="st">"answer"</span>]),</span>
<span id="cb41-72"><a href="#cb41-72" aria-hidden="true" tabindex="-1"></a> }</span>
<span id="cb41-73"><a href="#cb41-73" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> transform_fn, {<span class="st">"remove_columns"</span>: [<span class="st">"question"</span>]}</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb42"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb42-1"><a href="#cb42-1" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> grpo</span></span>
<span id="cb42-2"><a href="#cb42-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb42-3"><a href="#cb42-3" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb42-4"><a href="#cb42-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">beta</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.001</span></span>
<span id="cb42-5"><a href="#cb42-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">max_completion_length</span><span class="kw">:</span><span class="at"> </span><span class="dv">512</span></span>
<span id="cb42-6"><a href="#cb42-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">num_generations</span><span class="kw">:</span><span class="at"> </span><span class="dv">4</span></span>
<span id="cb42-7"><a href="#cb42-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">rollout_func</span><span class="kw">:</span><span class="at"> </span><span class="st">"math_env.math_solver_rollout"</span><span class="co"> # Custom rollout function</span></span>
<span id="cb42-8"><a href="#cb42-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">reward_funcs</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"math_env.math_reward"</span><span class="kw">]</span></span>
<span id="cb42-9"><a href="#cb42-9" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">reward_weights</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="fl">1.0</span><span class="kw">]</span></span>
<span id="cb42-10"><a href="#cb42-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb42-11"><a href="#cb42-11" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
<span id="cb42-12"><a href="#cb42-12" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> openai/gsm8k</span></span>
<span id="cb42-13"><a href="#cb42-13" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">name</span><span class="kw">:</span><span class="at"> main</span></span>
<span id="cb42-14"><a href="#cb42-14" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> math_env.math_transform</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>The <code>rollout_func</code> parameter accepts a fully qualified name (e.g., <code>module_name.function_name</code>) that points to a callable function in your local directory. The function receives:</p>
<ul>
<li><code>model</code>: The language model</li>
<li><code>processing_class</code>: The tokenizer/processing class</li>
<li><code>prompts</code>: List of prompt dictionaries</li>
<li><code>generation_config</code> (optional): Generation configuration</li>
</ul>
<p>The function should return a list of completion strings.</p>
<p>For more OpenEnv examples, see <a href="https://huggingface.co/docs/trl/main/en/openenv">TRL OpenEnv Documentation</a>.</p>
</section>
<section id="grpo-with-dapodr.-grpo-loss" class="level4">
<h4 class="anchored" data-anchor-id="grpo-with-dapodr.-grpo-loss">GRPO with DAPO/Dr.&nbsp;GRPO loss</h4>
<p>The DAPO paper, and subsequently the Dr.&nbsp;GRPO paper, proposed alternative loss functions for GRPO to remediate the penalty in longer responses.</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb43"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb43-1"><a href="#cb43-1" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb43-2"><a href="#cb43-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">loss_type</span><span class="kw">:</span><span class="at"> dr_grpo</span></span>
<span id="cb43-3"><a href="#cb43-3" aria-hidden="true" tabindex="-1"></a><span class="co"> # Normalizes loss based on max completion length (default: 256)</span></span>
<span id="cb43-4"><a href="#cb43-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">max_completion_length</span><span class="kw">:</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>For more information, see <a href="https://huggingface.co/docs/trl/v0.17.0/en/grpo_trainer#loss-types">GRPO docs</a>.</p>
</section>
<section id="async-grpo" class="level4">
<h4 class="anchored" data-anchor-id="async-grpo">Async GRPO</h4>
<p>Async GRPO overlaps vLLM generation with training by producing rollouts in a background thread. While the model trains on the current batch, the next batch is already being generated. This can significantly reduce wall-clock time per step.</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb44"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb44-1"><a href="#cb44-1" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb44-2"><a href="#cb44-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_data_producer</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span><span class="co"> # Enable data producer protocol</span></span>
<span id="cb44-3"><a href="#cb44-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_vllm</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb44-4"><a href="#cb44-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">async_prefetch</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span><span class="co"> # Generate rollouts in background thread</span></span>
<span id="cb44-5"><a href="#cb44-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">prefetch_depth</span><span class="kw">:</span><span class="at"> </span><span class="dv">1</span><span class="co"> # Number of rollouts to prefetch</span></span>
<span id="cb44-6"><a href="#cb44-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_sync_interval</span><span class="kw">:</span><span class="at"> </span><span class="dv">2</span><span class="co"> # Sync weights to vLLM every N steps</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<div class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Note
</div>
</div>
<div class="callout-body-container callout-body">
<p>Because the background thread generates completions with slightly stale model weights, async GRPO uses importance sampling correction to account for the distribution shift. This is controlled by <code>vllm_importance_sampling_correction: true</code> (default when async is enabled).</p>
</div>
</div>
<section id="vllm-lora-sync" class="level5">
<h5 class="anchored" data-anchor-id="vllm-lora-sync">vLLM LoRA Sync</h5>
<p>By default, weight sync to vLLM merges the LoRA adapter into the base model and broadcasts all parameters via NCCL. LoRA sync is a faster alternative that saves only the adapter weights to the filesystem and has vLLM load them natively using Punica kernels.</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb45"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb45-1"><a href="#cb45-1" aria-hidden="true" tabindex="-1"></a><span class="fu">adapter</span><span class="kw">:</span><span class="at"> lora</span></span>
<span id="cb45-2"><a href="#cb45-2" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_r</span><span class="kw">:</span><span class="at"> </span><span class="dv">32</span></span>
<span id="cb45-3"><a href="#cb45-3" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_alpha</span><span class="kw">:</span><span class="at"> </span><span class="dv">64</span></span>
<span id="cb45-4"><a href="#cb45-4" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_linear</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb45-5"><a href="#cb45-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb45-6"><a href="#cb45-6" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb45-7"><a href="#cb45-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_lora_sync</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span><span class="co"> # Enable native LoRA sync</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>When <code>vllm_lora_sync: true</code> is set, axolotl automatically selects the LoRA-aware vLLM serve module. Start vLLM as usual:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb46"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb46-1"><a href="#cb46-1" aria-hidden="true" tabindex="-1"></a><span class="va">CUDA_VISIBLE_DEVICES</span><span class="op">=</span>0 <span class="ex">axolotl</span> vllm-serve config.yaml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>Then start training on a separate GPU:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb47"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb47-1"><a href="#cb47-1" aria-hidden="true" tabindex="-1"></a><span class="va">CUDA_VISIBLE_DEVICES</span><span class="op">=</span>1 <span class="ex">axolotl</span> train config.yaml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<div class="callout callout-style-default callout-tip callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p>LoRA sync is especially beneficial with multi-GPU training (FSDP/DeepSpeed), where NCCL merge-sync can cause GPU contention with vLLM generation.</p>
</div>
</div>
</section>
<section id="streaming-partial-batch" class="level5">
<h5 class="anchored" data-anchor-id="streaming-partial-batch">Streaming Partial Batch</h5>
<p>Instead of scoring the entire batch at once, streaming mode scores one prompt group at a time. This enables finer-grained zero-advantage skipping and reduces peak memory usage during scoring.</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb48"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb48-1"><a href="#cb48-1" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb48-2"><a href="#cb48-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">streaming_partial_batch</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="importance-sampling-correction" class="level5">
<h5 class="anchored" data-anchor-id="importance-sampling-correction">Importance Sampling Correction</h5>
<p>When using async prefetch, completions are generated from a slightly older version of the model. Importance sampling (IS) correction adjusts the policy gradient to account for this distribution shift.</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb49"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb49-1"><a href="#cb49-1" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb49-2"><a href="#cb49-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_importance_sampling_correction</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span><span class="co"> # Enable IS correction</span></span>
<span id="cb49-3"><a href="#cb49-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">importance_sampling_level</span><span class="kw">:</span><span class="at"> token</span><span class="co"> # 'token' or 'sequence'</span></span>
<span id="cb49-4"><a href="#cb49-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">off_policy_mask_threshold</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.5</span><span class="co"> # Mask sequences with IS ratio below this</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<ul>
<li><code>importance_sampling_level: token</code> applies per-token IS ratios (recommended with Liger kernel)</li>
<li><code>importance_sampling_level: sequence</code> applies per-sequence IS ratios</li>
<li><code>off_policy_mask_threshold</code> masks out sequences where the IS ratio indicates they are too far off-policy</li>
</ul>
</section>
<section id="replay-buffer" class="level5">
<h5 class="anchored" data-anchor-id="replay-buffer">Replay Buffer</h5>
<p>The replay buffer caches rollout groups that had learning signal (non-zero reward variance) and uses them to replace zero-signal groups in later batches.</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb50"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb50-1"><a href="#cb50-1" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb50-2"><a href="#cb50-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">replay_buffer_size</span><span class="kw">:</span><span class="at"> </span><span class="dv">100</span><span class="co"> # Max cached groups (0 = disabled)</span></span>
<span id="cb50-3"><a href="#cb50-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">replay_recompute_logps</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span><span class="co"> # Recompute log-probs for replayed data (recommended)</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<div class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Note
</div>
</div>
<div class="callout-body-container callout-body">
<p>When <code>replay_recompute_logps: true</code> (default), old log-probabilities are recomputed using the current model weights. This fixes the IS mismatch that would otherwise occur when replaying stale data.</p>
</div>
</div>
</section>
<section id="deferred-re-rolling" class="level5">
<h5 class="anchored" data-anchor-id="deferred-re-rolling">Deferred Re-rolling</h5>
<p>Failed prompts (those for which every generation receives zero reward) are buffered and re-injected into later batches, when the model may be better equipped to solve them.</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb51"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb51-1"><a href="#cb51-1" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb51-2"><a href="#cb51-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">reroll_start_fraction</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.5</span><span class="co"> # Start re-rolling after 50% of training</span></span>
<span id="cb51-3"><a href="#cb51-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">reroll_max_groups</span><span class="kw">:</span><span class="at"> </span><span class="dv">1</span><span class="co"> # Max groups to replace per batch</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="zero-advantage-batch-skipping" class="level5">
<h5 class="anchored" data-anchor-id="zero-advantage-batch-skipping">Zero-Advantage Batch Skipping</h5>
<p>When all advantages in a micro-batch are zero (no learning signal), the forward/backward pass is skipped entirely. This is enabled by default and logged as <code>skipped_zero_adv_batches=1</code>.</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb52"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb52-1"><a href="#cb52-1" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb52-2"><a href="#cb52-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">skip_zero_advantage_batches</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span><span class="co"> # default</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="parallel-reward-workers" class="level5">
<h5 class="anchored" data-anchor-id="parallel-reward-workers">Parallel Reward Workers</h5>
<p>Reward functions that use <code>signal.alarm()</code> (e.g., <code>math_verify</code>) must run in the main thread. Parallel reward workers use subprocesses to work around this limitation while enabling concurrent reward computation.</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb53"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb53-1"><a href="#cb53-1" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb53-2"><a href="#cb53-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">reward_num_workers</span><span class="kw">:</span><span class="at"> </span><span class="dv">4</span><span class="co"> # Number of subprocess workers (1 = no parallelism)</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="full-async-grpo-example" class="level5">
<h5 class="anchored" data-anchor-id="full-async-grpo-example">Full Async GRPO Example</h5>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb54"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb54-1"><a href="#cb54-1" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model</span><span class="kw">:</span><span class="at"> Qwen/Qwen2.5-1.5B-Instruct</span></span>
<span id="cb54-2"><a href="#cb54-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb54-3"><a href="#cb54-3" aria-hidden="true" tabindex="-1"></a><span class="fu">vllm</span><span class="kw">:</span></span>
<span id="cb54-4"><a href="#cb54-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">host</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.0.0.0</span></span>
<span id="cb54-5"><a href="#cb54-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">port</span><span class="kw">:</span><span class="at"> </span><span class="dv">8000</span></span>
<span id="cb54-6"><a href="#cb54-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">gpu_memory_utilization</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.35</span></span>
<span id="cb54-7"><a href="#cb54-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">dtype</span><span class="kw">:</span><span class="at"> auto</span></span>
<span id="cb54-8"><a href="#cb54-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb54-9"><a href="#cb54-9" aria-hidden="true" tabindex="-1"></a><span class="fu">adapter</span><span class="kw">:</span><span class="at"> lora</span></span>
<span id="cb54-10"><a href="#cb54-10" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_r</span><span class="kw">:</span><span class="at"> </span><span class="dv">32</span></span>
<span id="cb54-11"><a href="#cb54-11" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_alpha</span><span class="kw">:</span><span class="at"> </span><span class="dv">64</span></span>
<span id="cb54-12"><a href="#cb54-12" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_linear</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb54-13"><a href="#cb54-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb54-14"><a href="#cb54-14" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> grpo</span></span>
<span id="cb54-15"><a href="#cb54-15" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb54-16"><a href="#cb54-16" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_data_producer</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb54-17"><a href="#cb54-17" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_vllm</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb54-18"><a href="#cb54-18" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">async_prefetch</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb54-19"><a href="#cb54-19" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">prefetch_depth</span><span class="kw">:</span><span class="at"> </span><span class="dv">1</span></span>
<span id="cb54-20"><a href="#cb54-20" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_sync_interval</span><span class="kw">:</span><span class="at"> </span><span class="dv">2</span></span>
<span id="cb54-21"><a href="#cb54-21" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_lora_sync</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb54-22"><a href="#cb54-22" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">streaming_partial_batch</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb54-23"><a href="#cb54-23" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_importance_sampling_correction</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb54-24"><a href="#cb54-24" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">off_policy_mask_threshold</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.5</span></span>
<span id="cb54-25"><a href="#cb54-25" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">importance_sampling_level</span><span class="kw">:</span><span class="at"> token</span></span>
<span id="cb54-26"><a href="#cb54-26" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">num_generations</span><span class="kw">:</span><span class="at"> </span><span class="dv">8</span></span>
<span id="cb54-27"><a href="#cb54-27" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">max_completion_length</span><span class="kw">:</span><span class="at"> </span><span class="dv">512</span></span>
<span id="cb54-28"><a href="#cb54-28" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">reward_funcs</span><span class="kw">:</span></span>
<span id="cb54-29"><a href="#cb54-29" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> rewards.accuracy_reward</span></span>
<span id="cb54-30"><a href="#cb54-30" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">reroll_start_fraction</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.5</span></span>
<span id="cb54-31"><a href="#cb54-31" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">replay_buffer_size</span><span class="kw">:</span><span class="at"> </span><span class="dv">100</span></span>
<span id="cb54-32"><a href="#cb54-32" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">reward_num_workers</span><span class="kw">:</span><span class="at"> </span><span class="dv">4</span></span>
<span id="cb54-33"><a href="#cb54-33" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">skip_zero_advantage_batches</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb54-34"><a href="#cb54-34" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb54-35"><a href="#cb54-35" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
<span id="cb54-36"><a href="#cb54-36" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> AI-MO/NuminaMath-TIR</span></span>
<span id="cb54-37"><a href="#cb54-37" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> rewards.prompt_transform</span></span>
<span id="cb54-38"><a href="#cb54-38" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">split</span><span class="kw">:</span><span class="at"> train</span></span>
<span id="cb54-39"><a href="#cb54-39" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb54-40"><a href="#cb54-40" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_accumulation_steps</span><span class="kw">:</span><span class="at"> </span><span class="dv">4</span></span>
<span id="cb54-41"><a href="#cb54-41" aria-hidden="true" tabindex="-1"></a><span class="fu">micro_batch_size</span><span class="kw">:</span><span class="at"> </span><span class="dv">2</span></span>
<span id="cb54-42"><a href="#cb54-42" aria-hidden="true" tabindex="-1"></a><span class="fu">max_steps</span><span class="kw">:</span><span class="at"> </span><span class="dv">500</span></span>
<span id="cb54-43"><a href="#cb54-43" aria-hidden="true" tabindex="-1"></a><span class="fu">learning_rate</span><span class="kw">:</span><span class="at"> </span><span class="fl">1e-5</span></span>
<span id="cb54-44"><a href="#cb54-44" aria-hidden="true" tabindex="-1"></a><span class="fu">bf16</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb54-45"><a href="#cb54-45" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_checkpointing</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb55"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb55-1"><a href="#cb55-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Terminal 1: Start vLLM on GPU 0</span></span>
<span id="cb55-2"><a href="#cb55-2" aria-hidden="true" tabindex="-1"></a><span class="va">CUDA_VISIBLE_DEVICES</span><span class="op">=</span>0 <span class="ex">axolotl</span> vllm-serve config.yaml</span>
<span id="cb55-3"><a href="#cb55-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb55-4"><a href="#cb55-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Terminal 2: Train on GPU 1</span></span>
<span id="cb55-5"><a href="#cb55-5" aria-hidden="true" tabindex="-1"></a><span class="va">CUDA_VISIBLE_DEVICES</span><span class="op">=</span>1 <span class="ex">axolotl</span> train config.yaml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="multi-gpu-async-grpo" class="level5">
<h5 class="anchored" data-anchor-id="multi-gpu-async-grpo">Multi-GPU Async GRPO</h5>
<p>Async GRPO supports FSDP and DeepSpeed ZeRO-3 for multi-GPU training. vLLM runs on one GPU while training is distributed across the remaining GPUs.</p>
<p><strong>FSDP:</strong></p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb56"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb56-1"><a href="#cb56-1" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp</span><span class="kw">:</span></span>
<span id="cb56-2"><a href="#cb56-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> full_shard</span></span>
<span id="cb56-3"><a href="#cb56-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> auto_wrap</span></span>
<span id="cb56-4"><a href="#cb56-4" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_config</span><span class="kw">:</span></span>
<span id="cb56-5"><a href="#cb56-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">fsdp_transformer_layer_cls_to_wrap</span><span class="kw">:</span><span class="at"> Qwen2DecoderLayer</span></span>
<span id="cb56-6"><a href="#cb56-6" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_checkpointing_kwargs</span><span class="kw">:</span></span>
<span id="cb56-7"><a href="#cb56-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_reentrant</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p><strong>DeepSpeed ZeRO-3:</strong></p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb57"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb57-1"><a href="#cb57-1" aria-hidden="true" tabindex="-1"></a><span class="fu">deepspeed</span><span class="kw">:</span><span class="at"> deepspeed_configs/zero3_bf16.json</span></span>
<span id="cb57-2"><a href="#cb57-2" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_checkpointing_kwargs</span><span class="kw">:</span></span>
<span id="cb57-3"><a href="#cb57-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_reentrant</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span><span class="co"> # Required for ZeRO-3</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb58"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb58-1"><a href="#cb58-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Terminal 1: Start vLLM on GPU 0</span></span>
<span id="cb58-2"><a href="#cb58-2" aria-hidden="true" tabindex="-1"></a><span class="va">CUDA_VISIBLE_DEVICES</span><span class="op">=</span>0 <span class="ex">axolotl</span> vllm-serve config.yaml</span>
<span id="cb58-3"><a href="#cb58-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb58-4"><a href="#cb58-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Terminal 2: Train on GPUs 1,2</span></span>
<span id="cb58-5"><a href="#cb58-5" aria-hidden="true" tabindex="-1"></a><span class="va">CUDA_VISIBLE_DEVICES</span><span class="op">=</span>1,2 <span class="ex">accelerate</span> launch <span class="at">--num_processes</span> 2 <span class="at">-m</span> axolotl.cli.train config.yaml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<div class="callout callout-style-default callout-important callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Important
</div>
</div>
<div class="callout-body-container callout-body">
<p>With multi-GPU async prefetch, only rank 0 generates completions in the background thread. Results are broadcast to all ranks on the main thread. This avoids FSDP/DeepSpeed collective deadlocks from unsynchronized background threads.</p>
</div>
</div>
</section>
</section>
</section>
<section id="gdpo" class="level3">
<h3 class="anchored" data-anchor-id="gdpo">GDPO</h3>
<p>GDPO (Group reward-Decoupled Normalization Policy Optimization) extends GRPO for multi-reward training. It addresses the <strong>reward advantage collapse</strong> problem by normalizing each reward function independently before combining them.</p>
<div class="callout callout-style-default callout-tip callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p>Use GDPO when training with multiple reward functions. For a single reward, GRPO and GDPO produce equivalent results.</p>
</div>
</div>
<p>Paper: <a href="https://arxiv.org/pdf/2601.05242">https://arxiv.org/pdf/2601.05242</a></p>
<p>GDPO uses TRL’s native <code>multi_objective_aggregation</code> parameter under the hood. When you set <code>rl: gdpo</code>, axolotl automatically configures TRL to use <code>normalize_then_sum</code> aggregation.</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb59"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb59-1"><a href="#cb59-1" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model</span><span class="kw">:</span><span class="at"> Qwen/Qwen2.5-1.5B-Instruct</span></span>
<span id="cb59-2"><a href="#cb59-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb59-3"><a href="#cb59-3" aria-hidden="true" tabindex="-1"></a><span class="fu">vllm</span><span class="kw">:</span></span>
<span id="cb59-4"><a href="#cb59-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">host</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.0.0.0</span></span>
<span id="cb59-5"><a href="#cb59-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">port</span><span class="kw">:</span><span class="at"> </span><span class="dv">8000</span></span>
<span id="cb59-6"><a href="#cb59-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">tensor_parallel_size</span><span class="kw">:</span><span class="at"> </span><span class="dv">2</span></span>
<span id="cb59-7"><a href="#cb59-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">gpu_memory_utilization</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.85</span></span>
<span id="cb59-8"><a href="#cb59-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb59-9"><a href="#cb59-9" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> gdpo</span></span>
<span id="cb59-10"><a href="#cb59-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb59-11"><a href="#cb59-11" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb59-12"><a href="#cb59-12" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">beta</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.001</span></span>
<span id="cb59-13"><a href="#cb59-13" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">max_completion_length</span><span class="kw">:</span><span class="at"> </span><span class="dv">256</span></span>
<span id="cb59-14"><a href="#cb59-14" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_vllm</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb59-15"><a href="#cb59-15" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">num_generations</span><span class="kw">:</span><span class="at"> </span><span class="dv">4</span></span>
<span id="cb59-16"><a href="#cb59-16" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">reward_funcs</span><span class="kw">:</span></span>
<span id="cb59-17"><a href="#cb59-17" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> rewards.format_reward</span></span>
<span id="cb59-18"><a href="#cb59-18" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> rewards.correctness_reward</span></span>
<span id="cb59-19"><a href="#cb59-19" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">reward_weights</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="fl">1.0</span><span class="kw">,</span><span class="at"> </span><span class="fl">2.0</span><span class="kw">]</span></span>
<span id="cb59-20"><a href="#cb59-20" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb59-21"><a href="#cb59-21" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
<span id="cb59-22"><a href="#cb59-22" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> openai/gsm8k</span></span>
<span id="cb59-23"><a href="#cb59-23" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">name</span><span class="kw">:</span><span class="at"> main</span></span>
<span id="cb59-24"><a href="#cb59-24" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> rewards.oai_gsm8k_transform</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>You can also use GRPO with explicit aggregation control:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb60"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb60-1"><a href="#cb60-1" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> grpo</span></span>
<span id="cb60-2"><a href="#cb60-2" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb60-3"><a href="#cb60-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">multi_objective_aggregation</span><span class="kw">:</span><span class="at"> normalize_then_sum</span><span class="co"> # GDPO behavior</span></span>
<span id="cb60-4"><a href="#cb60-4" aria-hidden="true" tabindex="-1"></a><span class="co"> # or: sum_then_normalize # Default GRPO behavior</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<section id="gdpo-vs-grpo" class="level4">
<h4 class="anchored" data-anchor-id="gdpo-vs-grpo">GDPO vs GRPO</h4>
<table class="caption-top table">
<colgroup>
<col style="width: 40%">
<col style="width: 30%">
<col style="width: 30%">
</colgroup>
<thead>
<tr class="header">
<th>Aspect</th>
<th>GRPO</th>
<th>GDPO</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td><strong>Aggregation</strong></td>
<td><code>sum_then_normalize</code></td>
<td><code>normalize_then_sum</code></td>
</tr>
<tr class="even">
<td><strong>Multi-reward</strong></td>
<td>May collapse advantages</td>
<td>Preserves reward signals</td>
</tr>
<tr class="odd">
<td><strong>Single reward</strong></td>
<td>Standard behavior</td>
<td>Equivalent to GRPO</td>
</tr>
</tbody>
</table>
</section>
<section id="why-gdpo" class="level4">
<h4 class="anchored" data-anchor-id="why-gdpo">Why GDPO?</h4>
<p>When using multiple rewards with GRPO, different reward combinations can produce identical advantages:</p>
<pre><code># Example: format + correctness rewards
[format=0, correct=3] → sum=3
[format=1, correct=2] → sum=3 ← GRPO sees these as equal!
[format=2, correct=1] → sum=3
[format=3, correct=0] → sum=3</code></pre>
<p>GDPO normalizes each reward independently, preserving their relative differences.</p>
</section>
<section id="reward-functions-1" class="level4">
<h4 class="anchored" data-anchor-id="reward-functions-1">Reward Functions</h4>
<p>GDPO uses the same reward function format as GRPO:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb62"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb62-1"><a href="#cb62-1" aria-hidden="true" tabindex="-1"></a><span class="co"># rewards.py</span></span>
<span id="cb62-2"><a href="#cb62-2" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> format_reward(completions, <span class="op">**</span>kwargs) <span class="op">-&gt;</span> <span class="bu">list</span>[<span class="bu">float</span>]:</span>
<span id="cb62-3"><a href="#cb62-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> [<span class="fl">1.0</span> <span class="cf">if</span> <span class="bu">len</span>(c) <span class="op">&gt;</span> <span class="dv">10</span> <span class="cf">else</span> <span class="fl">0.0</span> <span class="cf">for</span> c <span class="kw">in</span> completions]</span>
<span id="cb62-4"><a href="#cb62-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb62-5"><a href="#cb62-5" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> correctness_reward(completions, answers, <span class="op">**</span>kwargs) <span class="op">-&gt;</span> <span class="bu">list</span>[<span class="bu">float</span>]:</span>
<span id="cb62-6"><a href="#cb62-6" aria-hidden="true" tabindex="-1"></a> rewards <span class="op">=</span> []</span>
<span id="cb62-7"><a href="#cb62-7" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> completion, answer <span class="kw">in</span> <span class="bu">zip</span>(completions, answers):</span>
<span id="cb62-8"><a href="#cb62-8" aria-hidden="true" tabindex="-1"></a> <span class="co"># Your scoring logic here</span></span>
<span id="cb62-9"><a href="#cb62-9" aria-hidden="true" tabindex="-1"></a> rewards.append(score)</span>
<span id="cb62-10"><a href="#cb62-10" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> rewards</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="sequence-parallelism" class="level4">
<h4 class="anchored" data-anchor-id="sequence-parallelism">Sequence Parallelism</h4>
<p>GDPO supports sequence parallelism for long-context training:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb63"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb63-1"><a href="#cb63-1" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> gdpo</span></span>
<span id="cb63-2"><a href="#cb63-2" aria-hidden="true" tabindex="-1"></a><span class="fu">context_parallel_size</span><span class="kw">:</span><span class="at"> </span><span class="dv">2</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
</section>
<section id="simpo" class="level3">
<h3 class="anchored" data-anchor-id="simpo">SimPO</h3>
<p>SimPO uses <a href="https://huggingface.co/docs/trl/main/en/cpo_trainer">CPOTrainer</a> but with an alternative loss function.</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb64"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb64-1"><a href="#cb64-1" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> simpo</span></span>
<span id="cb64-2"><a href="#cb64-2" aria-hidden="true" tabindex="-1"></a><span class="fu">rl_beta</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.1</span><span class="co"> # default in CPOTrainer</span></span>
<span id="cb64-3"><a href="#cb64-3" aria-hidden="true" tabindex="-1"></a><span class="fu">cpo_alpha</span><span class="kw">:</span><span class="at"> </span><span class="fl">1.0</span><span class="co"> # default in CPOTrainer</span></span>
<span id="cb64-4"><a href="#cb64-4" aria-hidden="true" tabindex="-1"></a><span class="fu">simpo_gamma</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.5</span><span class="co"> # default in CPOTrainer</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>This method uses the same dataset format as <a href="#dpo">DPO</a>.</p>
</section>
<section id="ebft" class="level3">
<h3 class="anchored" data-anchor-id="ebft">EBFT</h3>
<p>EBFT (Energy-Based Fine-Tuning) fine-tunes language models by optimizing a <strong>feature-matching loss</strong> rather than relying on external reward functions. A frozen copy of the model extracts embeddings from both generated and ground-truth completions, and the generator is updated via REINFORCE to match the ground-truth feature moments.</p>
<p>Paper: <a href="https://arxiv.org/abs/2603.12248">“Matching Features, Not Tokens: Energy-Based Fine-Tuning of Language Models”</a> (Jelassi et al., 2026)</p>
<p><strong>Key advantages:</strong></p>
<ul>
<li>No reward model or verifier required — works on any (prompt, completion) data</li>
<li>Applicable to non-verifiable tasks (code, translation, creative writing)</li>
<li>Operates on model rollouts (not teacher forcing), reducing distribution shift</li>
</ul>
<p>EBFT supports two modes:</p>
<ul>
<li><strong>Structured mode</strong>: For QA/instruction data with prompt + completion pairs. Uses vLLM for generation (like GRPO).</li>
<li><strong>Strided mode</strong>: For unstructured text without prompt/completion splits. Uses strided block-parallel generation with flex_attention — no vLLM needed.</li>
</ul>
<section id="structured-mode" class="level4">
<h4 class="anchored" data-anchor-id="structured-mode">Structured Mode</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb65"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb65-1"><a href="#cb65-1" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model</span><span class="kw">:</span><span class="at"> Qwen/Qwen3-4B</span></span>
<span id="cb65-2"><a href="#cb65-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb65-3"><a href="#cb65-3" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> ebft</span></span>
<span id="cb65-4"><a href="#cb65-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb65-5"><a href="#cb65-5" aria-hidden="true" tabindex="-1"></a><span class="fu">ebft</span><span class="kw">:</span></span>
<span id="cb65-6"><a href="#cb65-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">feature_layers</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="fl">0.25</span><span class="kw">,</span><span class="at"> </span><span class="fl">0.5</span><span class="kw">,</span><span class="at"> </span><span class="fl">0.75</span><span class="kw">]</span><span class="co"> # Extract features at 25%, 50%, 75% depth</span></span>
<span id="cb65-7"><a href="#cb65-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">embed_method</span><span class="kw">:</span><span class="at"> last_token</span></span>
<span id="cb65-8"><a href="#cb65-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_whitening</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
<span id="cb65-9"><a href="#cb65-9" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">alignment_coef</span><span class="kw">:</span><span class="at"> </span><span class="fl">1.0</span><span class="co"> # Cosine similarity reward weight</span></span>
<span id="cb65-10"><a href="#cb65-10" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">diversity_coef</span><span class="kw">:</span><span class="at"> </span><span class="fl">1.0</span><span class="co"> # Pairwise dot product penalty</span></span>
<span id="cb65-11"><a href="#cb65-11" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">ce_coef</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.0</span><span class="co"> # Cross-entropy on GT tokens (0 = off)</span></span>
<span id="cb65-12"><a href="#cb65-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb65-13"><a href="#cb65-13" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb65-14"><a href="#cb65-14" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">num_generations</span><span class="kw">:</span><span class="at"> </span><span class="dv">4</span></span>
<span id="cb65-15"><a href="#cb65-15" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">max_completion_length</span><span class="kw">:</span><span class="at"> </span><span class="dv">256</span></span>
<span id="cb65-16"><a href="#cb65-16" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">temperature</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.7</span></span>
<span id="cb65-17"><a href="#cb65-17" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_vllm</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb65-18"><a href="#cb65-18" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_server_host</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.0.0.0</span></span>
<span id="cb65-19"><a href="#cb65-19" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_server_port</span><span class="kw">:</span><span class="at"> </span><span class="dv">8000</span></span>
<span id="cb65-20"><a href="#cb65-20" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_lora_sync</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span><span class="co"> # LoRA adapter sync (recommended)</span></span>
<span id="cb65-21"><a href="#cb65-21" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_sync_interval</span><span class="kw">:</span><span class="at"> </span><span class="dv">3</span></span>
<span id="cb65-22"><a href="#cb65-22" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_data_producer</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb65-23"><a href="#cb65-23" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">async_prefetch</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span><span class="co"> # Set false for sync mode</span></span>
<span id="cb65-24"><a href="#cb65-24" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">scale_rewards</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb65-25"><a href="#cb65-25" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">loss_type</span><span class="kw">:</span><span class="at"> grpo</span></span>
<span id="cb65-26"><a href="#cb65-26" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">epsilon</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.2</span></span>
<span id="cb65-27"><a href="#cb65-27" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb65-28"><a href="#cb65-28" aria-hidden="true" tabindex="-1"></a><span class="fu">vllm</span><span class="kw">:</span></span>
<span id="cb65-29"><a href="#cb65-29" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">gpu_memory_utilization</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.5</span></span>
<span id="cb65-30"><a href="#cb65-30" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">max_model_len</span><span class="kw">:</span><span class="at"> </span><span class="dv">2048</span></span>
<span id="cb65-31"><a href="#cb65-31" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb65-32"><a href="#cb65-32" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
<span id="cb65-33"><a href="#cb65-33" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> nvidia/OpenCodeInstruct</span></span>
<span id="cb65-34"><a href="#cb65-34" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> ebft_opencode.transform</span></span>
<span id="cb65-35"><a href="#cb65-35" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">split</span><span class="kw">:</span><span class="at"> train[:500]</span></span>
<span id="cb65-36"><a href="#cb65-36" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb65-37"><a href="#cb65-37" aria-hidden="true" tabindex="-1"></a><span class="fu">adapter</span><span class="kw">:</span><span class="at"> lora</span></span>
<span id="cb65-38"><a href="#cb65-38" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_r</span><span class="kw">:</span><span class="at"> </span><span class="dv">16</span></span>
<span id="cb65-39"><a href="#cb65-39" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_alpha</span><span class="kw">:</span><span class="at"> </span><span class="dv">32</span></span>
<span id="cb65-40"><a href="#cb65-40" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_linear</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb66"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb66-1"><a href="#cb66-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Terminal 1: Start vLLM</span></span>
<span id="cb66-2"><a href="#cb66-2" aria-hidden="true" tabindex="-1"></a><span class="va">CUDA_VISIBLE_DEVICES</span><span class="op">=</span>0 <span class="ex">axolotl</span> vllm-serve config.yaml</span>
<span id="cb66-3"><a href="#cb66-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb66-4"><a href="#cb66-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Terminal 2: Train</span></span>
<span id="cb66-5"><a href="#cb66-5" aria-hidden="true" tabindex="-1"></a><span class="va">CUDA_VISIBLE_DEVICES</span><span class="op">=</span>1 <span class="ex">axolotl</span> train config.yaml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="strided-mode" class="level4">
<h4 class="anchored" data-anchor-id="strided-mode">Strided Mode</h4>
<p>Use strided mode for unstructured text (raw code, prose). No vLLM is needed — it runs on a single GPU.</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb67"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb67-1"><a href="#cb67-1" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model</span><span class="kw">:</span><span class="at"> meta-llama/Llama-3.2-1B</span></span>
<span id="cb67-2"><a href="#cb67-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb67-3"><a href="#cb67-3" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> ebft</span></span>
<span id="cb67-4"><a href="#cb67-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb67-5"><a href="#cb67-5" aria-hidden="true" tabindex="-1"></a><span class="fu">ebft</span><span class="kw">:</span></span>
<span id="cb67-6"><a href="#cb67-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">mode</span><span class="kw">:</span><span class="at"> strided</span></span>
<span id="cb67-7"><a href="#cb67-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">stride</span><span class="kw">:</span><span class="at"> </span><span class="dv">8</span></span>
<span id="cb67-8"><a href="#cb67-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">context_length</span><span class="kw">:</span><span class="at"> </span><span class="dv">8</span></span>
<span id="cb67-9"><a href="#cb67-9" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">generate_max_len</span><span class="kw">:</span><span class="at"> </span><span class="dv">8</span></span>
<span id="cb67-10"><a href="#cb67-10" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">n_samples_per_prompt</span><span class="kw">:</span><span class="at"> </span><span class="dv">4</span></span>
<span id="cb67-11"><a href="#cb67-11" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">temperature</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.6</span></span>
<span id="cb67-12"><a href="#cb67-12" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">feature_layers</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="fl">0.25</span><span class="kw">,</span><span class="at"> </span><span class="fl">0.5</span><span class="kw">,</span><span class="at"> </span><span class="fl">0.75</span><span class="kw">]</span></span>
<span id="cb67-13"><a href="#cb67-13" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">embed_method</span><span class="kw">:</span><span class="at"> last_token</span></span>
<span id="cb67-14"><a href="#cb67-14" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_whitening</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb67-15"><a href="#cb67-15" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">alignment_coef</span><span class="kw">:</span><span class="at"> </span><span class="fl">1.0</span></span>
<span id="cb67-16"><a href="#cb67-16" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">diversity_coef</span><span class="kw">:</span><span class="at"> </span><span class="fl">1.0</span></span>
<span id="cb67-17"><a href="#cb67-17" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">rl_coef</span><span class="kw">:</span><span class="at"> </span><span class="fl">1.0</span></span>
<span id="cb67-18"><a href="#cb67-18" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">ce_coef</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.03</span></span>
<span id="cb67-19"><a href="#cb67-19" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">advantage_estimator</span><span class="kw">:</span><span class="at"> rloo</span></span>
<span id="cb67-20"><a href="#cb67-20" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb67-21"><a href="#cb67-21" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
<span id="cb67-22"><a href="#cb67-22" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> nvidia/OpenCodeInstruct</span></span>
<span id="cb67-23"><a href="#cb67-23" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> ebft_strided_structured.transform</span></span>
<span id="cb67-24"><a href="#cb67-24" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">split</span><span class="kw">:</span><span class="at"> train[:1%]</span></span>
<span id="cb67-25"><a href="#cb67-25" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb67-26"><a href="#cb67-26" aria-hidden="true" tabindex="-1"></a><span class="fu">flash_attention</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
<span id="cb67-27"><a href="#cb67-27" aria-hidden="true" tabindex="-1"></a><span class="fu">flex_attention</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span><span class="co"> # Strided mode uses flex_attention</span></span>
<span id="cb67-28"><a href="#cb67-28" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_checkpointing</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb67-29"><a href="#cb67-29" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_checkpointing_kwargs</span><span class="kw">:</span></span>
<span id="cb67-30"><a href="#cb67-30" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_reentrant</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span><span class="co"> # Required for flex_attention</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb68"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb68-1"><a href="#cb68-1" aria-hidden="true" tabindex="-1"></a><span class="va">CUDA_VISIBLE_DEVICES</span><span class="op">=</span>0 <span class="ex">axolotl</span> train config.yaml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<div class="callout callout-style-default callout-tip callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p>See <code>examples/ebft/</code> for complete example configs covering Llama 1B/3B/8B and Qwen3 4B/8B models in both modes.</p>
</div>
</div>
</section>
<section id="ebft-configuration-reference" class="level4">
<h4 class="anchored" data-anchor-id="ebft-configuration-reference">EBFT Configuration Reference</h4>
<table class="caption-top table">
<colgroup>
<col style="width: 33%">
<col style="width: 27%">
<col style="width: 39%">
</colgroup>
<thead>
<tr class="header">
<th>Parameter</th>
<th>Default</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td><code>ebft.feature_layers</code></td>
<td><code>[0.25, 0.5, 0.75]</code></td>
<td>Layer depths for feature extraction (fractional)</td>
</tr>
<tr class="even">
<td><code>ebft.embed_method</code></td>
<td><code>last_token</code></td>
<td>Feature pooling: <code>last_token</code>, <code>mean_pooling</code>, <code>concat</code></td>
</tr>
<tr class="odd">
<td><code>ebft.use_whitening</code></td>
<td><code>false</code></td>
<td>SVD whitening of feature dimensions</td>
</tr>
<tr class="even">
<td><code>ebft.alignment_coef</code></td>
<td><code>1.0</code></td>
<td>Cosine similarity reward weight</td>
</tr>
<tr class="odd">
<td><code>ebft.diversity_coef</code></td>
<td><code>1.0</code></td>
<td>Pairwise dot product penalty weight</td>
</tr>
<tr class="even">
<td><code>ebft.ce_coef</code></td>
<td><code>0.0</code></td>
<td>Cross-entropy loss on ground-truth tokens</td>
</tr>
<tr class="odd">
<td><code>ebft.mode</code></td>
<td><code>structured</code></td>
<td><code>structured</code> (vLLM) or <code>strided</code> (no vLLM)</td>
</tr>
<tr class="even">
<td><code>ebft.stride</code></td>
<td>—</td>
<td>Tokens between anchor points (strided mode)</td>
</tr>
<tr class="odd">
<td><code>ebft.context_length</code></td>
<td>—</td>
<td>Context window per block (strided mode)</td>
</tr>
<tr class="even">
<td><code>ebft.generate_max_len</code></td>
<td>—</td>
<td>Tokens to generate per block (strided mode)</td>
</tr>
<tr class="odd">
<td><code>ebft.n_samples_per_prompt</code></td>
<td>—</td>
<td>Rollouts per document (strided mode)</td>
</tr>
<tr class="even">
<td><code>ebft.advantage_estimator</code></td>
<td><code>grpo</code></td>
<td><code>grpo</code> or <code>rloo</code> (strided mode)</td>
</tr>
</tbody>
</table>
</section>
</section>
<section id="nemo-gym-integration" class="level3">
<h3 class="anchored" data-anchor-id="nemo-gym-integration">NeMo Gym Integration</h3>
<p><a href="https://github.com/NVIDIA-NeMo/Gym">NeMo Gym</a> provides 50+ verified RL environments (math, coding, tool-use, reasoning) with deterministic reward signals. The axolotl integration supports both <strong>single-turn</strong> (call <code>/verify</code> after generation) and <strong>multi-turn</strong> (agent-based tool execution via <code>/run</code>).</p>
<section id="single-turn-simplest" class="level4">
<h4 class="anchored" data-anchor-id="single-turn-simplest">Single-Turn (Simplest)</h4>
<p>For environments that only need answer verification (math, coding challenges). No agent server needed — the reward function calls <code>/verify</code> directly on the resource server.</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb69"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb69-1"><a href="#cb69-1" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model</span><span class="kw">:</span><span class="at"> Qwen/Qwen2.5-0.5B-Instruct</span></span>
<span id="cb69-2"><a href="#cb69-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb69-3"><a href="#cb69-3" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> grpo</span></span>
<span id="cb69-4"><a href="#cb69-4" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template</span><span class="kw">:</span><span class="at"> tokenizer_default</span></span>
<span id="cb69-5"><a href="#cb69-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb69-6"><a href="#cb69-6" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb69-7"><a href="#cb69-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_vllm</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span><span class="co"> # Colocate mode (single GPU)</span></span>
<span id="cb69-8"><a href="#cb69-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">num_generations</span><span class="kw">:</span><span class="at"> </span><span class="dv">4</span></span>
<span id="cb69-9"><a href="#cb69-9" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">max_completion_length</span><span class="kw">:</span><span class="at"> </span><span class="dv">128</span></span>
<span id="cb69-10"><a href="#cb69-10" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">temperature</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.9</span></span>
<span id="cb69-11"><a href="#cb69-11" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">reward_funcs</span><span class="kw">:</span></span>
<span id="cb69-12"><a href="#cb69-12" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> axolotl.integrations.nemo_gym.rewards.reward_nemo_gym_verify</span></span>
<span id="cb69-13"><a href="#cb69-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb69-14"><a href="#cb69-14" aria-hidden="true" tabindex="-1"></a><span class="fu">plugins</span><span class="kw">:</span></span>
<span id="cb69-15"><a href="#cb69-15" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> axolotl.integrations.nemo_gym.NemoGymPlugin</span></span>
<span id="cb69-16"><a href="#cb69-16" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb69-17"><a href="#cb69-17" aria-hidden="true" tabindex="-1"></a><span class="fu">nemo_gym_enabled</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb69-18"><a href="#cb69-18" aria-hidden="true" tabindex="-1"></a><span class="fu">nemo_gym_dir</span><span class="kw">:</span><span class="at"> ~/Gym</span></span>
<span id="cb69-19"><a href="#cb69-19" aria-hidden="true" tabindex="-1"></a><span class="fu">nemo_gym_auto_start</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
<span id="cb69-20"><a href="#cb69-20" aria-hidden="true" tabindex="-1"></a><span class="fu">nemo_gym_head_port</span><span class="kw">:</span><span class="at"> </span><span class="dv">11000</span></span>
<span id="cb69-21"><a href="#cb69-21" aria-hidden="true" tabindex="-1"></a><span class="fu">nemo_gym_datasets</span><span class="kw">:</span></span>
<span id="cb69-22"><a href="#cb69-22" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> resources_servers/reasoning_gym/data/train_basic_arithmetic.jsonl</span></span>
<span id="cb69-23"><a href="#cb69-23" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">server_name</span><span class="kw">:</span><span class="at"> reasoning_gym</span></span>
<span id="cb69-24"><a href="#cb69-24" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb69-25"><a href="#cb69-25" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
<span id="cb69-26"><a href="#cb69-26" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> ~/Gym/resources_servers/reasoning_gym/data/train_basic_arithmetic.jsonl</span></span>
<span id="cb69-27"><a href="#cb69-27" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> chat_template</span></span>
<span id="cb69-28"><a href="#cb69-28" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_messages</span><span class="kw">:</span><span class="at"> responses_create_params.input</span></span>
<span id="cb69-29"><a href="#cb69-29" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">message_field_content</span><span class="kw">:</span><span class="at"> content</span></span>
<span id="cb69-30"><a href="#cb69-30" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">message_field_role</span><span class="kw">:</span><span class="at"> role</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb70"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb70-1"><a href="#cb70-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Terminal 1: Start NeMo Gym resource server</span></span>
<span id="cb70-2"><a href="#cb70-2" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> ~/Gym <span class="kw">&amp;&amp;</span> <span class="ex">.venv/bin/ng_run</span> <span class="dt">\</span></span>
<span id="cb70-3"><a href="#cb70-3" aria-hidden="true" tabindex="-1"></a> <span class="st">"+config_paths=[resources_servers/reasoning_gym/configs/resources_only.yaml]"</span> <span class="dt">\</span></span>
<span id="cb70-4"><a href="#cb70-4" aria-hidden="true" tabindex="-1"></a> <span class="st">"+skip_venv_if_present=true"</span></span>
<span id="cb70-5"><a href="#cb70-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb70-6"><a href="#cb70-6" aria-hidden="true" tabindex="-1"></a><span class="co"># Terminal 2: Train</span></span>
<span id="cb70-7"><a href="#cb70-7" aria-hidden="true" tabindex="-1"></a><span class="va">CUDA_VISIBLE_DEVICES</span><span class="op">=</span>0 <span class="ex">axolotl</span> train config.yaml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<div class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Note
</div>
</div>
<div class="callout-body-container callout-body">
<p><code>nemo_gym_datasets.path</code> is relative to <code>nemo_gym_dir</code>. Don't use absolute paths, or they will be double-joined.</p>
</div>
</div>
</section>
<section id="multi-turn-with-async-grpo-recommended" class="level4">
<h4 class="anchored" data-anchor-id="multi-turn-with-async-grpo-recommended">Multi-Turn with Async GRPO (Recommended)</h4>
<p>For environments with tool-use (weather, search, databases). An agent server orchestrates multi-turn interactions: generate → parse tool calls → execute tools → feed results back → repeat until done.</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb71"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb71-1"><a href="#cb71-1" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model</span><span class="kw">:</span><span class="at"> Qwen/Qwen3-0.6B</span></span>
<span id="cb71-2"><a href="#cb71-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb71-3"><a href="#cb71-3" aria-hidden="true" tabindex="-1"></a><span class="fu">rl</span><span class="kw">:</span><span class="at"> grpo</span></span>
<span id="cb71-4"><a href="#cb71-4" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template</span><span class="kw">:</span><span class="at"> tokenizer_default</span></span>
<span id="cb71-5"><a href="#cb71-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb71-6"><a href="#cb71-6" aria-hidden="true" tabindex="-1"></a><span class="fu">adapter</span><span class="kw">:</span><span class="at"> lora</span></span>
<span id="cb71-7"><a href="#cb71-7" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_r</span><span class="kw">:</span><span class="at"> </span><span class="dv">16</span></span>
<span id="cb71-8"><a href="#cb71-8" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_alpha</span><span class="kw">:</span><span class="at"> </span><span class="dv">32</span></span>
<span id="cb71-9"><a href="#cb71-9" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_modules</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="at">q_proj</span><span class="kw">,</span><span class="at"> k_proj</span><span class="kw">,</span><span class="at"> v_proj</span><span class="kw">,</span><span class="at"> o_proj</span><span class="kw">,</span><span class="at"> gate_proj</span><span class="kw">,</span><span class="at"> up_proj</span><span class="kw">,</span><span class="at"> down_proj</span><span class="kw">]</span></span>
<span id="cb71-10"><a href="#cb71-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb71-11"><a href="#cb71-11" aria-hidden="true" tabindex="-1"></a><span class="fu">trl</span><span class="kw">:</span></span>
<span id="cb71-12"><a href="#cb71-12" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_vllm</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb71-13"><a href="#cb71-13" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_mode</span><span class="kw">:</span><span class="at"> server</span></span>
<span id="cb71-14"><a href="#cb71-14" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_server_host</span><span class="kw">:</span><span class="at"> localhost</span></span>
<span id="cb71-15"><a href="#cb71-15" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_server_port</span><span class="kw">:</span><span class="at"> </span><span class="dv">8000</span></span>
<span id="cb71-16"><a href="#cb71-16" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_lora_sync</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb71-17"><a href="#cb71-17" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">vllm_sync_interval</span><span class="kw">:</span><span class="at"> </span><span class="dv">5</span></span>
<span id="cb71-18"><a href="#cb71-18" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">use_data_producer</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb71-19"><a href="#cb71-19" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">async_prefetch</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span><span class="co"> # 3x speedup</span></span>
<span id="cb71-20"><a href="#cb71-20" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">num_generations</span><span class="kw">:</span><span class="at"> </span><span class="dv">4</span></span>
<span id="cb71-21"><a href="#cb71-21" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">max_completion_length</span><span class="kw">:</span><span class="at"> </span><span class="dv">512</span></span>
<span id="cb71-22"><a href="#cb71-22" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">temperature</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.8</span></span>
<span id="cb71-23"><a href="#cb71-23" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">reward_funcs</span><span class="kw">:</span></span>
<span id="cb71-24"><a href="#cb71-24" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> axolotl.integrations.nemo_gym.rewards.reward_env</span></span>
<span id="cb71-25"><a href="#cb71-25" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb71-26"><a href="#cb71-26" aria-hidden="true" tabindex="-1"></a><span class="fu">plugins</span><span class="kw">:</span></span>
<span id="cb71-27"><a href="#cb71-27" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> axolotl.integrations.nemo_gym.NemoGymPlugin</span></span>
<span id="cb71-28"><a href="#cb71-28" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb71-29"><a href="#cb71-29" aria-hidden="true" tabindex="-1"></a><span class="fu">nemo_gym_enabled</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb71-30"><a href="#cb71-30" aria-hidden="true" tabindex="-1"></a><span class="fu">nemo_gym_auto_start</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
<span id="cb71-31"><a href="#cb71-31" aria-hidden="true" tabindex="-1"></a><span class="fu">nemo_gym_head_port</span><span class="kw">:</span><span class="at"> </span><span class="dv">11000</span></span>
<span id="cb71-32"><a href="#cb71-32" aria-hidden="true" tabindex="-1"></a><span class="fu">nemo_gym_multi_turn</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb71-33"><a href="#cb71-33" aria-hidden="true" tabindex="-1"></a><span class="fu">nemo_gym_verify_timeout</span><span class="kw">:</span><span class="at"> </span><span class="dv">120</span></span>
<span id="cb71-34"><a href="#cb71-34" aria-hidden="true" tabindex="-1"></a><span class="fu">nemo_gym_datasets</span><span class="kw">:</span></span>
<span id="cb71-35"><a href="#cb71-35" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> resources_servers/example_single_tool_call/data/weather_tool_calling.jsonl</span></span>
<span id="cb71-36"><a href="#cb71-36" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">server_name</span><span class="kw">:</span><span class="at"> example_single_tool_call</span></span>
<span id="cb71-37"><a href="#cb71-37" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb71-38"><a href="#cb71-38" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
<span id="cb71-39"><a href="#cb71-39" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> ~/Gym/resources_servers/example_single_tool_call/data/weather_tool_calling.jsonl</span></span>
<span id="cb71-40"><a href="#cb71-40" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> chat_template</span></span>
<span id="cb71-41"><a href="#cb71-41" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_messages</span><span class="kw">:</span><span class="at"> responses_create_params.input</span></span>
<span id="cb71-42"><a href="#cb71-42" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">message_field_content</span><span class="kw">:</span><span class="at"> content</span></span>
<span id="cb71-43"><a href="#cb71-43" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">message_field_role</span><span class="kw">:</span><span class="at"> role</span></span>
<span id="cb71-44"><a href="#cb71-44" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb71-45"><a href="#cb71-45" aria-hidden="true" tabindex="-1"></a><span class="fu">vllm</span><span class="kw">:</span></span>
<span id="cb71-46"><a href="#cb71-46" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">gpu_memory_utilization</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.85</span></span>
<span id="cb71-47"><a href="#cb71-47" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">max_model_len</span><span class="kw">:</span><span class="at"> </span><span class="dv">2048</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<p>Multi-turn requires three services running:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb72"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb72-1"><a href="#cb72-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Terminal 1: vLLM with LoRA + tool calling</span></span>
<span id="cb72-2"><a href="#cb72-2" aria-hidden="true" tabindex="-1"></a><span class="va">VLLM_ALLOW_RUNTIME_LORA_UPDATING</span><span class="op">=</span>1 <span class="va">CUDA_VISIBLE_DEVICES</span><span class="op">=</span>0 <span class="dt">\</span></span>
<span id="cb72-3"><a href="#cb72-3" aria-hidden="true" tabindex="-1"></a> <span class="ex">python</span> <span class="at">-m</span> vllm.entrypoints.openai.api_server <span class="dt">\</span></span>
<span id="cb72-4"><a href="#cb72-4" aria-hidden="true" tabindex="-1"></a> <span class="at">--model</span> Qwen/Qwen3-0.6B <span class="at">--max-model-len</span> 2048 <span class="dt">\</span></span>
<span id="cb72-5"><a href="#cb72-5" aria-hidden="true" tabindex="-1"></a> <span class="at">--gpu-memory-utilization</span> 0.85 <span class="dt">\</span></span>
<span id="cb72-6"><a href="#cb72-6" aria-hidden="true" tabindex="-1"></a> <span class="at">--enable-lora</span> <span class="at">--max-lora-rank</span> 64 <span class="dt">\</span></span>
<span id="cb72-7"><a href="#cb72-7" aria-hidden="true" tabindex="-1"></a> <span class="at">--enable-auto-tool-choice</span> <span class="at">--tool-call-parser</span> hermes</span>
<span id="cb72-8"><a href="#cb72-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb72-9"><a href="#cb72-9" aria-hidden="true" tabindex="-1"></a><span class="co"># Terminal 2: NeMo Gym servers (resource + model proxy + agent)</span></span>
<span id="cb72-10"><a href="#cb72-10" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> ~/Gym <span class="kw">&amp;&amp;</span> <span class="ex">.venv/bin/ng_run</span> <span class="dt">\</span></span>
<span id="cb72-11"><a href="#cb72-11" aria-hidden="true" tabindex="-1"></a> <span class="st">"+config_paths=[configs/axolotl_tool_calling.yaml]"</span> <span class="dt">\</span></span>
<span id="cb72-12"><a href="#cb72-12" aria-hidden="true" tabindex="-1"></a> <span class="st">"+skip_venv_if_present=true"</span></span>
<span id="cb72-13"><a href="#cb72-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb72-14"><a href="#cb72-14" aria-hidden="true" tabindex="-1"></a><span class="co"># Terminal 3: Training</span></span>
<span id="cb72-15"><a href="#cb72-15" aria-hidden="true" tabindex="-1"></a><span class="va">CUDA_VISIBLE_DEVICES</span><span class="op">=</span>1 <span class="ex">axolotl</span> train config.yaml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
<div class="callout callout-style-default callout-important callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Important
</div>
</div>
<div class="callout-body-container callout-body">
<p>Multi-turn requires a NeMo Gym agent config YAML that defines three components: a resource server (tools + <code>/verify</code>), a model server proxy (forwards to your vLLM), and an agent server (orchestrates <code>/run</code>). See the <a href="https://github.com/NVIDIA-NeMo/Gym">NeMo Gym README</a> for the agent config format.</p>
</div>
</div>
</section>
<section id="nemo-gym-prerequisites" class="level4">
<h4 class="anchored" data-anchor-id="nemo-gym-prerequisites">NeMo Gym Prerequisites</h4>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb73"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb73-1"><a href="#cb73-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Clone and set up NeMo Gym</span></span>
<span id="cb73-2"><a href="#cb73-2" aria-hidden="true" tabindex="-1"></a><span class="fu">git</span> clone https://github.com/NVIDIA-NeMo/Gym.git ~/Gym</span>
<span id="cb73-3"><a href="#cb73-3" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> ~/Gym</span>
<span id="cb73-4"><a href="#cb73-4" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> venv <span class="at">--python</span> 3.12 <span class="kw">&amp;&amp;</span> <span class="bu">source</span> .venv/bin/activate <span class="kw">&amp;&amp;</span> <span class="ex">uv</span> sync</span>
<span id="cb73-5"><a href="#cb73-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb73-6"><a href="#cb73-6" aria-hidden="true" tabindex="-1"></a><span class="co"># Fix pycosat build (GCC 13+)</span></span>
<span id="cb73-7"><a href="#cb73-7" aria-hidden="true" tabindex="-1"></a><span class="va">CFLAGS</span><span class="op">=</span><span class="st">""</span> <span class="ex">uv</span> pip install pycosat <span class="at">--python</span> .venv/bin/python <span class="at">--no-build-isolation</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="nemo-gym-configuration-reference" class="level4">
<h4 class="anchored" data-anchor-id="nemo-gym-configuration-reference">NeMo Gym Configuration Reference</h4>
<table class="caption-top table">
<colgroup>
<col style="width: 28%">
<col style="width: 15%">
<col style="width: 23%">
<col style="width: 33%">
</colgroup>
<thead>
<tr class="header">
<th>Parameter</th>
<th>Type</th>
<th>Default</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td><code>nemo_gym_enabled</code></td>
<td>bool</td>
<td>—</td>
<td>Enable the NeMo Gym integration</td>
</tr>
<tr class="even">
<td><code>nemo_gym_dir</code></td>
<td>str</td>
<td><code>~/Gym</code></td>
<td>Path to NeMo Gym repo</td>
</tr>
<tr class="odd">
<td><code>nemo_gym_auto_start</code></td>
<td>bool</td>
<td><code>true</code></td>
<td>Auto-start resource servers</td>
</tr>
<tr class="even">
<td><code>nemo_gym_head_port</code></td>
<td>int</td>
<td><code>11000</code></td>
<td>Head server port</td>
</tr>
<tr class="odd">
<td><code>nemo_gym_multi_turn</code></td>
<td>bool</td>
<td><code>false</code></td>
<td>Enable multi-turn via agent <code>/run</code></td>
</tr>
<tr class="even">
<td><code>nemo_gym_verify_timeout</code></td>
<td>int</td>
<td><code>30</code></td>
<td>Per-request timeout (seconds)</td>
</tr>
<tr class="odd">
<td><code>nemo_gym_datasets</code></td>
<td>list</td>
<td>required</td>
<td>Dataset configs with <code>path</code> and <code>server_name</code></td>
</tr>
</tbody>
</table>
</section>
<section id="reward-functions-2" class="level4">
<h4 class="anchored" data-anchor-id="reward-functions-2">Reward Functions</h4>
<table class="caption-top table">
<colgroup>
<col style="width: 34%">
<col style="width: 20%">
<col style="width: 44%">
</colgroup>
<thead>
<tr class="header">
<th>Function</th>
<th>Mode</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td><code>axolotl.integrations.nemo_gym.rewards.reward_nemo_gym_verify</code></td>
<td>Single-turn</td>
<td>Calls <code>/verify</code>, returns binary reward</td>
</tr>
<tr class="even">
<td><code>axolotl.integrations.nemo_gym.rewards.reward_env</code></td>
<td>Multi-turn</td>
<td>Passthrough reward from agent <code>/run</code></td>
</tr>
</tbody>
</table>
</section>
</section>
<section id="using-local-dataset-files" class="level3">
<h3 class="anchored" data-anchor-id="using-local-dataset-files">Using local dataset files</h3>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb74"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb74-1"><a href="#cb74-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
<span id="cb74-2"><a href="#cb74-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">ds_type</span><span class="kw">:</span><span class="at"> json</span></span>
<span id="cb74-3"><a href="#cb74-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">data_files</span><span class="kw">:</span></span>
<span id="cb74-4"><a href="#cb74-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> orca_rlhf.jsonl</span></span>
<span id="cb74-5"><a href="#cb74-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">split</span><span class="kw">:</span><span class="at"> train</span></span>
<span id="cb74-6"><a href="#cb74-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> chatml.intel</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
<section id="trl-auto-unwrapping-for-peft" class="level3">
<h3 class="anchored" data-anchor-id="trl-auto-unwrapping-for-peft">TRL auto-unwrapping for PEFT</h3>
<p>TRL supports auto-unwrapping PEFT models for RL training paradigms which rely on a reference model. This significantly reduces memory pressure as an additional reference model does not need to be loaded, and reference model log-probabilities can be obtained by disabling PEFT adapters. This is enabled by default. To turn it off, pass the following config:</p>
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb75"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb75-1"><a href="#cb75-1" aria-hidden="true" tabindex="-1"></a><span class="co"># load ref model when adapter training.</span></span>
<span id="cb75-2"><a href="#cb75-2" aria-hidden="true" tabindex="-1"></a><span class="fu">rl_adapter_ref_model</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
</section>
</section>
</main> <!-- /main -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
const icon = "";
// Attach an AnchorJS permalink, placed on the right, to every `.anchored` element.
const anchorJS = new window.AnchorJS();
anchorJS.options = { placement: 'right', icon };
anchorJS.add('.anchored');
// True when at least one class on `el` starts with the `code-annotation-` prefix.
const isCodeAnnotation = (el) =>
  Array.from(el.classList).some((clz) => clz.startsWith('code-annotation-'));
/**
 * ClipboardJS "success" handler.
 * Marks the copy button as checked with a "Copied!" title (plus a Bootstrap
 * tooltip when `window.bootstrap` is present), restores the button one second
 * later, and clears the text selection made for the copy.
 */
const onCopySuccess = function(e) {
  // button target
  const button = e.trigger;
  // don't keep focus
  button.blur();
  // flash "checked"
  button.classList.add('code-copy-button-checked');
  const originalTitle = button.getAttribute("title");
  button.setAttribute("title", "Copied!");

  // attributes that only exist while the Bootstrap tooltip is displayed
  const tooltipAttrs = [
    ["data-bs-toggle", "tooltip"],
    ["data-bs-placement", "left"],
    ["data-bs-title", "Copied!"],
  ];
  let tooltip;
  if (window.bootstrap) {
    for (const [attrName, attrValue] of tooltipAttrs) {
      button.setAttribute(attrName, attrValue);
    }
    tooltip = new bootstrap.Tooltip(button, {
      trigger: "manual",
      customClass: "code-copy-button-tooltip",
      offset: [0, -8],
    });
    tooltip.show();
  }

  // undo the visual feedback after one second
  setTimeout(() => {
    if (tooltip) {
      tooltip.hide();
      button.removeAttribute("data-bs-title");
      button.removeAttribute("data-bs-toggle");
      button.removeAttribute("data-bs-placement");
    }
    button.setAttribute("title", originalTitle);
    button.classList.remove('code-copy-button-checked');
  }, 1000);

  // clear code selection
  e.clearSelection();
}
/**
 * Produces the text ClipboardJS should copy for a copy button.
 *
 * Works on a deep clone of the button's parent element so the page itself is
 * not modified: direct children of the first <code> element that carry a
 * `code-annotation-*` class are removed, then the remaining text is returned.
 *
 * @param trigger  the clicked copy button
 * @returns the `innerText` of the cleaned <code> element
 */
const getTextToCopy = function(trigger) {
  const outerScaffold = trigger.parentElement.cloneNode(true);
  const codeEl = outerScaffold.querySelector('code');
  // `children` is a live HTMLCollection: removing a node while iterating it
  // shifts the remaining entries and the element right after each removed one
  // would be skipped. Iterate over a static snapshot instead.
  for (const childEl of Array.from(codeEl.children)) {
    if (isCodeAnnotation(childEl)) {
      childEl.remove();
    }
  }
  return codeEl.innerText;
}
// Copy buttons outside the embedded-source modal share one ClipboardJS instance.
const clipboard = new window.ClipboardJS(
  '.code-copy-button:not([data-in-quarto-modal])',
  { text: getTextToCopy }
);
clipboard.on('success', onCopySuccess);

// Copy buttons inside the embedded-source modal (when the page has one) get
// their own instance, with the modal element passed as ClipboardJS `container`.
const sourceModal = window.document.getElementById('quarto-embedded-source-code-modal');
if (sourceModal) {
  const clipboardModal = new window.ClipboardJS(
    '.code-copy-button[data-in-quarto-modal]',
    { text: getTextToCopy, container: sourceModal }
  );
  clipboardModal.on('success', onCopySuccess);
}
// URL patterns treated as "internal" (not external links).
// Regex literals are used rather than `new RegExp("...")`: inside a string
// literal `\.` collapses to `.`, which made the site pattern match any
// character in place of the dots (e.g. "https://docsXaxolotl-ai...").
var localhostRegex = /^(?:http|https):\/\/localhost\:?[0-9]*\//;
var mailtoRegex = /^mailto:/;
var filterRegex = /https:\/\/docs\.axolotl\.ai/;

/**
 * Tells whether `href` points at this site, at a localhost server, or is a
 * `mailto:` link.
 *
 * @param href  URL string to classify
 * @returns true when any of the three internal patterns matches
 */
var isInternal = (href) => {
  return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
}
// Walk every non-navigation link and, for the external ones, put back the
// href recorded in `data-original-href` when such a record exists.
var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
for (const link of links) {
  const isExternal = !isInternal(link.href);
  // undo the damage that might have been done by quarto-nav.js in the case of
  // links that we want to consider external
  if (isExternal && link.dataset.originalHref !== undefined) {
    link.href = link.dataset.originalHref;
  }
}
/**
 * Registers a tippy popup on `el` using the shared Quarto popup settings.
 *
 * @param el             element the popup is attached to
 * @param contentFn      optional; becomes tippy's `content` when truthy
 * @param onTriggerFn    optional; becomes tippy's `onTrigger` when truthy
 * @param onUntriggerFn  optional; becomes tippy's `onUntrigger` when truthy
 */
function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
  const options = {
    allowHTML: true,
    maxWidth: 500,
    delay: 100,
    arrow: false,
    // mount the popup inside the parent of the reference element
    appendTo: (reference) => reference.parentElement,
    interactive: true,
    interactiveBorder: 10,
    theme: 'quarto',
    placement: 'bottom-start',
  };
  // only hooks that were actually supplied are copied into the options
  const hooks = {
    content: contentFn,
    onTrigger: onTriggerFn,
    onUntrigger: onUntriggerFn,
  };
  for (const [key, hook] of Object.entries(hooks)) {
    if (hook) {
      options[key] = hook;
    }
  }
  window.tippy(el, options);
}
// Footnote references: hovering one shows the HTML of the footnote it targets.
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (const ref of noterefs) {
  tippyHover(ref, () => {
    // use id or data attribute instead here
    let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
    // an absolute URL is reduced to its fragment; a value URL() rejects is kept as is
    try { href = new URL(href).hash; } catch {}
    const noteId = href.replace(/^#\/?/, "");
    const note = window.document.getElementById(noteId);
    return note ? note.innerHTML : "";
  });
}
const xrefs = window.document.querySelectorAll('a.quarto-xref');
/**
 * Turns a cross-reference target into the HTML string shown in its popup.
 *
 * @param id    id of the target, or null when a whole page was fetched
 * @param note  the target element (modified in place, so pass a clone or a
 *              node from a parsed document)
 * @returns for sections (`id` null or starting with `sec-`) with more than two
 *          children: the first child plus the first following child that is
 *          not an empty <p>; for other sections: the note's innerHTML; for
 *          everything else: outerHTML for callouts, innerHTML otherwise
 */
const processXRef = (id, note) => {
  // Strip column container classes from the node and all of its descendants
  const stripColumnClz = (node) => {
    node.classList.remove("page-full", "page-columns");
    for (const kid of node.children || []) {
      stripColumnClz(kid);
    }
  };
  // Run Quarto's math typesetting on `target` when that hook is available
  const typeset = (target) => {
    if (window.Quarto?.typesetMath) {
      window.Quarto.typesetMath(target);
    }
  };

  stripColumnClz(note)

  const isSection = id === null || id.startsWith('sec-');
  if (!isSection) {
    // Remove any anchor links if they are present
    const anchorLink = note.querySelector('a.anchorjs-link');
    if (anchorLink) {
      anchorLink.remove();
    }
    typeset(note);
    return note.classList.contains("callout") ? note.outerHTML : note.innerHTML;
  }

  const hasEnoughChildren = note.children && note.children.length > 2;
  if (!hasEnoughChildren) {
    typeset(note);
    return note.innerHTML;
  }

  // Special case sections, only their first couple elements
  const container = document.createElement("div");
  container.appendChild(note.children[0].cloneNode(true));
  const firstBody = Array.from(note.children)
    .slice(1)
    .find((child) => !(child.tagName === "P" && child.innerText === ""));
  if (firstBody) {
    container.appendChild(firstBody.cloneNode(true));
  }
  typeset(container);
  return container.innerHTML
}
// Cross-reference popups: each `a.quarto-xref` link gets a tippy whose content
// is resolved when the popup is triggered, either from an element of this
// page or from a page fetched over the network.
for (var i=0; i<xrefs.length; i++) {
const xref = xrefs[i];
tippyHover(xref, undefined, function(instance) {
// Keep the popup disabled until its content has been resolved; every path
// below re-enables and shows it when done.
instance.disable();
let url = xref.getAttribute('href');
let hash = undefined;
if (url.startsWith('#')) {
hash = url;
} else {
// If URL() rejects the href, hash stays undefined and the whole-page branch runs
try { hash = new URL(url).hash; } catch {}
}
if (hash) {
const id = hash.replace(/^#\/?/, "");
const note = window.document.getElementById(id);
if (note !== null) {
// Target is on this page: process a clone so the live DOM is left untouched
try {
const html = processXRef(id, note.cloneNode(true));
instance.setContent(html);
} finally {
instance.enable();
instance.show();
}
} else {
// See if we can fetch this
// (requests the href without its fragment and looks the id up in the parsed result)
fetch(url.split('#')[0])
.then(res => res.text())
.then(html => {
const parser = new DOMParser();
const htmlDoc = parser.parseFromString(html, "text/html");
const note = htmlDoc.getElementById(id);
if (note !== null) {
const html = processXRef(id, note);
instance.setContent(html);
}
}).finally(() => {
// Runs whether the fetch and lookup succeeded or not
instance.enable();
instance.show();
});
}
} else {
// See if we can fetch a full url (with no hash to target)
// This is a special case and we should probably do some content thinning / targeting
fetch(url)
.then(res => res.text())
.then(html => {
const parser = new DOMParser();
const htmlDoc = parser.parseFromString(html, "text/html");
const note = htmlDoc.querySelector('main.content');
if (note !== null) {
// This should only happen for chapter cross references
// (since there is no id in the URL)
// remove the first header
if (note.children.length > 0 && note.children[0].tagName === "HEADER") {
note.children[0].remove();
}
// A null id makes processXRef treat the content as a section
const html = processXRef(null, note);
instance.setContent(html);
}
}).finally(() => {
// Runs whether the fetch and lookup succeeded or not
instance.enable();
instance.show();
});
}
}, function(instance) {
// no-op onUntrigger handler
});
}
let selectedAnnoteEl;
// Builds the CSS selector for the <span> tagged with the given code cell and
// annotation via its `data-code-cell` / `data-code-annotation` attributes.
const selectorForAnnotation = (cell, annotation) =>
  `span[data-code-cell="${cell}"][data-code-annotation="${annotation}"]`;
/**
 * Highlights the code lines that belong to an annotation entry.
 *
 * Reads `data-target-cell` / `data-target-annotation` from `annoteEl`, finds
 * the matching annotation span, and positions two absolutely-positioned divs
 * (`code-annotation-line-highlight` and its `-gutter` twin) so they span from
 * the first to the last line listed in the span's `data-code-lines`. Finally
 * records `annoteEl` as the current selection.
 */
const selectCodeLines = (annoteEl) => {
  const targetCell = annoteEl.getAttribute("data-target-cell");
  const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
  const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
  // "1,2" -> ["<cell>-1", "<cell>-2"]: the element ids of the annotated lines
  const lineIds = annoteSpan
    .getAttribute("data-code-lines")
    .split(",")
    .map((line) => targetCell + "-" + line);
  if (lineIds.length === 0) {
    return;
  }

  // Returns the overlay div with this id, creating it and handing it to
  // `mount` for insertion when it does not exist yet.
  const ensureOverlay = (overlayId, mount) => {
    let overlay = window.document.getElementById(overlayId);
    if (overlay === null) {
      overlay = window.document.createElement("div");
      overlay.setAttribute("id", overlayId);
      overlay.style.position = 'absolute';
      mount(overlay);
    }
    return overlay;
  };

  //compute the position of the single el (top and bottom and make a div)
  const firstEl = window.document.getElementById(lineIds[0]);
  const top = firstEl.offsetTop;
  let height = firstEl.offsetHeight;
  const parent = firstEl.parentElement.parentElement;
  if (lineIds.length > 1) {
    // several lines: stretch from the top of the first to the bottom of the last
    const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
    const bottom = lastEl.offsetTop + lastEl.offsetHeight;
    height = bottom - top;
  }

  if (top !== null && height !== null && parent !== null) {
    // cook up a div (if necessary) and position it
    const highlight = ensureOverlay("code-annotation-line-highlight", (node) => {
      parent.appendChild(node);
    });
    highlight.style.top = top - 2 + "px";
    highlight.style.height = height + 4 + "px";
    highlight.style.left = 0;

    const gutterHighlight = ensureOverlay("code-annotation-line-highlight-gutter", (node) => {
      const codeCell = window.document.getElementById(targetCell);
      const gutter = codeCell.querySelector('.code-annotation-gutter');
      gutter.appendChild(node);
    });
    gutterHighlight.style.top = top - 2 + "px";
    gutterHighlight.style.height = height + 4 + "px";
  }
  selectedAnnoteEl = annoteEl;
};
// Removes both highlight overlays (when present) and forgets the current selection.
const unselectCodeLines = () => {
  const overlayIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"];
  for (const overlayId of overlayIds) {
    window.document.getElementById(overlayId)?.remove();
  }
  selectedAnnoteEl = undefined;
};
// Handle positioning of the toggle: on window resize, recompute the highlight
// of the currently selected annotation (if any). The handler is wrapped in
// throttle() with a 10 ms delay.
// The former `elRect = undefined;` statement was dropped: `elRect` is never
// declared or read in this script, so the assignment only created an implicit
// global (and would throw a ReferenceError under strict mode).
window.addEventListener(
  "resize",
  throttle(() => {
    if (selectedAnnoteEl) {
      selectCodeLines(selectedAnnoteEl);
    }
  }, 10)
);
/**
 * Wraps `fn` so that the very first call runs immediately and every call made
 * afterwards is deferred by `ms` milliseconds, with a newer call replacing a
 * still-pending one. Once a deferred call has run, the wrapper is reset and
 * the next call runs immediately again.
 *
 * @param fn  function to wrap
 * @param ms  delay in milliseconds applied to deferred calls
 * @returns the wrapped function
 */
function throttle(fn, ms) {
  let engaged = false;
  let pending;
  return (...args) => {
    if (engaged) {
      // all the others get throttled
      if (pending) clearTimeout(pending); // cancel #2
      pending = setTimeout(() => {
        fn.apply(this, args);
        pending = engaged = false;
      }, ms);
      return;
    }
    // first call gets through
    fn.apply(this, args);
    engaged = true;
  };
}
// Attach click handler to the DT: clicking an annotation entry selects its
// code lines, clicking the already-selected entry clears the selection.
const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
annoteDls.forEach((annoteDlNode) => {
  annoteDlNode.addEventListener('click', (event) => {
    const clickedEl = event.target;
    if (clickedEl === selectedAnnoteEl) {
      // Unselect the line
      unselectCodeLines();
      clickedEl.classList.remove('code-annotation-active');
      return;
    }
    // switch the selection: clear the old highlight and active marker first
    unselectCodeLines();
    const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
    if (activeEl) {
      activeEl.classList.remove('code-annotation-active');
    }
    selectCodeLines(clickedEl);
    clickedEl.classList.add('code-annotation-active');
  });
});
/**
 * Climbs from `el` through its ancestors until one is found whose parent has
 * a non-empty `data-cites` attribute.
 *
 * @param el  element to start from
 * @returns `{ el, cites }`, where `el` is the element directly below the
 *          parent carrying the attribute and `cites` is that attribute split
 *          on spaces; `undefined` when no such parent exists
 */
const findCites = (el) => {
  let current = el;
  while (current.parentElement) {
    const cites = current.parentElement.dataset.cites;
    if (cites) {
      return { el: current, cites: cites.split(' ') };
    }
    current = current.parentElement;
  }
  return undefined;
};
// Citation links: hovering one shows the bibliography entries listed in the
// `data-cites` attribute located by findCites().
var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
for (const ref of bibliorefs) {
  const citeInfo = findCites(ref);
  if (!citeInfo) {
    continue;
  }
  tippyHover(citeInfo.el, () => {
    const popup = window.document.createElement('div');
    for (const cite of citeInfo.cites) {
      const citeDiv = window.document.createElement('div');
      citeDiv.classList.add('hanging-indent', 'csl-entry');
      // copy the matching `ref-<key>` entry; the div stays empty when it is missing
      const biblioDiv = window.document.getElementById('ref-' + cite);
      if (biblioDiv) {
        citeDiv.innerHTML = biblioDiv.innerHTML;
      }
      popup.appendChild(citeDiv);
    }
    return popup.innerHTML;
  });
}
});
</script>
</div> <!-- /content -->
</body></html>