From bf0338bfed2c9157144f2caaa5bdab737fe03503 Mon Sep 17 00:00:00 2001
From: Quarto GHA Workflow Runner <quarto-github-actions-publish@example.com>
Date: Wed, 1 Apr 2026 17:36:13 +0000
Subject: [PATCH] Built site for gh-pages

---
 .nojekyll                  |    2 +-
 docs/config-reference.html | 2640 ++++++++++++++++++------------------
 search.json                |    2 +-
 sitemap.xml                |  470 +++----
 4 files changed, 1558 insertions(+), 1556 deletions(-)
diff --git a/.nojekyll b/.nojekyll
index be123c29e..df55b9fd0 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-5fc8e572
\ No newline at end of file
+b35dcba7
\ No newline at end of file
diff --git a/docs/config-reference.html b/docs/config-reference.html
index 3676db238..f4d4537d1 100644
--- a/docs/config-reference.html
+++ b/docs/config-reference.html
@@ -1076,1335 +1076,1337 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <span id="cb1-292"><a href="#cb1-292" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to perform weighting in DPO trainer</span></span>
 <span id="cb1-293"><a href="#cb1-293" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_use_weighting</span><span class="kw">:</span><span class="at"> bool | None</span></span>
 <span id="cb1-294"><a href="#cb1-294" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_label_smoothing</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-295"><a href="#cb1-295" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-296"><a href="#cb1-296" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use Liger kernel for DPO loss.</span></span>
-<span id="cb1-297"><a href="#cb1-297" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_use_liger_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-298"><a href="#cb1-298" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-299"><a href="#cb1-299" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_padding_free</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-295"><a href="#cb1-295" aria-hidden="true" tabindex="-1"></a><span class="co"># Precompute reference model log probabilities for DPO</span></span>
+<span id="cb1-296"><a href="#cb1-296" aria-hidden="true" tabindex="-1"></a><span class="fu">precompute_ref_log_probs</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-297"><a href="#cb1-297" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-298"><a href="#cb1-298" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use Liger kernel for DPO loss.</span></span>
+<span id="cb1-299"><a href="#cb1-299" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_use_liger_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
 <span id="cb1-300"><a href="#cb1-300" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-301"><a href="#cb1-301" aria-hidden="true" tabindex="-1"></a><span class="co"># A list of one or more datasets to finetune the model with</span></span>
-<span id="cb1-302"><a href="#cb1-302" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span><span class="at"> Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset | SyntheticDataset], MinLen(1)] | None</span></span>
-<span id="cb1-303"><a href="#cb1-303" aria-hidden="true" tabindex="-1"></a><span class="co">  # For SFTDataset:</span></span>
-<span id="cb1-304"><a href="#cb1-304" aria-hidden="true" tabindex="-1"></a><span class="co">  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory</span></span>
-<span id="cb1-305"><a href="#cb1-305" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-306"><a href="#cb1-306" aria-hidden="true" tabindex="-1"></a><span class="co">  # name of dataset split to load from</span></span>
-<span id="cb1-307"><a href="#cb1-307" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-308"><a href="#cb1-308" aria-hidden="true" tabindex="-1"></a><span class="co">  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]</span></span>
-<span id="cb1-309"><a href="#cb1-309" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> str | UserDefinedPrompterType | None</span></span>
-<span id="cb1-310"><a href="#cb1-310" aria-hidden="true" tabindex="-1"></a><span class="co">    # For UserDefinedPrompterType:</span></span>
-<span id="cb1-311"><a href="#cb1-311" aria-hidden="true" tabindex="-1"></a><span class="co">    # Custom user instruction prompt</span></span>
-<span id="cb1-312"><a href="#cb1-312" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">system_prompt</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-313"><a href="#cb1-313" aria-hidden="true" tabindex="-1"></a><span class="co">    # Use {system} as key to be replaced</span></span>
-<span id="cb1-314"><a href="#cb1-314" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">system_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-315"><a href="#cb1-315" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_system</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-316"><a href="#cb1-316" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_instruction</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-317"><a href="#cb1-317" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_input</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-318"><a href="#cb1-318" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_output</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-319"><a href="#cb1-319" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-320"><a href="#cb1-320" aria-hidden="true" tabindex="-1"></a><span class="co">    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to</span></span>
-<span id="cb1-321"><a href="#cb1-321" aria-hidden="true" tabindex="-1"></a><span class="co">    # be replaced. 'format' can include {input}</span></span>
-<span id="cb1-322"><a href="#cb1-322" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-323"><a href="#cb1-323" aria-hidden="true" tabindex="-1"></a><span class="co">    # 'no_input_format' cannot include {input}</span></span>
-<span id="cb1-324"><a href="#cb1-324" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">no_input_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-325"><a href="#cb1-325" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">input_transform</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-326"><a href="#cb1-326" aria-hidden="true" tabindex="-1"></a><span class="co">  # split dataset into N pieces (use with shards_idx)</span></span>
-<span id="cb1-327"><a href="#cb1-327" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">shards</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-328"><a href="#cb1-328" aria-hidden="true" tabindex="-1"></a><span class="co">  # the index of sharded dataset to use</span></span>
-<span id="cb1-329"><a href="#cb1-329" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">shards_idx</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-330"><a href="#cb1-330" aria-hidden="true" tabindex="-1"></a><span class="co">  # process dataset in N sequential chunks for memory efficiency (exclusive with</span></span>
-<span id="cb1-331"><a href="#cb1-331" aria-hidden="true" tabindex="-1"></a><span class="co">  # `shards`)</span></span>
-<span id="cb1-332"><a href="#cb1-332" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">preprocess_shards</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-333"><a href="#cb1-333" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">conversation</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-334"><a href="#cb1-334" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-335"><a href="#cb1-335" aria-hidden="true" tabindex="-1"></a><span class="co">  # The name of the chat template to use for training, following values are supported:</span></span>
-<span id="cb1-336"><a href="#cb1-336" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default: Uses the chat template that is available in the</span></span>
-<span id="cb1-337"><a href="#cb1-337" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_config.json. If the chat template is not available in the tokenizer, it</span></span>
-<span id="cb1-338"><a href="#cb1-338" aria-hidden="true" tabindex="-1"></a><span class="co">  # will raise an error. This is the default.</span></span>
-<span id="cb1-339"><a href="#cb1-339" aria-hidden="true" tabindex="-1"></a><span class="co">  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates</span></span>
-<span id="cb1-340"><a href="#cb1-340" aria-hidden="true" tabindex="-1"></a><span class="co">  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.</span></span>
-<span id="cb1-341"><a href="#cb1-341" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback</span></span>
-<span id="cb1-342"><a href="#cb1-342" aria-hidden="true" tabindex="-1"></a><span class="co">  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.</span></span>
-<span id="cb1-343"><a href="#cb1-343" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat</span></span>
-<span id="cb1-344"><a href="#cb1-344" aria-hidden="true" tabindex="-1"></a><span class="co">  # template. The custom jinja template should be provided in the chat_template_jinja</span></span>
-<span id="cb1-345"><a href="#cb1-345" aria-hidden="true" tabindex="-1"></a><span class="co">  # field.</span></span>
-<span id="cb1-346"><a href="#cb1-346" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">chat_template</span><span class="kw">:</span><span class="at"> ChatTemplate | str | None</span></span>
-<span id="cb1-347"><a href="#cb1-347" aria-hidden="true" tabindex="-1"></a><span class="co">  # Custom jinja chat template or path to jinja file. Used only if `chat_template:</span></span>
-<span id="cb1-348"><a href="#cb1-348" aria-hidden="true" tabindex="-1"></a><span class="co">  # jinja` or empty.</span></span>
-<span id="cb1-349"><a href="#cb1-349" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">chat_template_jinja</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-350"><a href="#cb1-350" aria-hidden="true" tabindex="-1"></a><span class="co">  # path to source data files</span></span>
-<span id="cb1-351"><a href="#cb1-351" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> str | list[str] | None</span></span>
-<span id="cb1-352"><a href="#cb1-352" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">input_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-353"><a href="#cb1-353" aria-hidden="true" tabindex="-1"></a><span class="co">  # name of dataset configuration to load</span></span>
-<span id="cb1-354"><a href="#cb1-354" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-355"><a href="#cb1-355" aria-hidden="true" tabindex="-1"></a><span class="co">  # defines the datatype when path is a file</span></span>
-<span id="cb1-356"><a href="#cb1-356" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">ds_type</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-357"><a href="#cb1-357" aria-hidden="true" tabindex="-1"></a><span class="co">  # For `completion` datasets only, uses the provided field instead of `text` column</span></span>
-<span id="cb1-358"><a href="#cb1-358" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-359"><a href="#cb1-359" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_human</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-360"><a href="#cb1-360" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-361"><a href="#cb1-361" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the messages (default: "messages")</span></span>
-<span id="cb1-362"><a href="#cb1-362" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_messages</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-363"><a href="#cb1-363" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON</span></span>
-<span id="cb1-364"><a href="#cb1-364" aria-hidden="true" tabindex="-1"></a><span class="co">  # schema](https://json-schema.org/learn/getting-started-step-by-step).</span></span>
-<span id="cb1-365"><a href="#cb1-365" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_tools</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-366"><a href="#cb1-366" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the reasoning trace (default: "reasoning_content").</span></span>
-<span id="cb1-367"><a href="#cb1-367" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_thinking</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-368"><a href="#cb1-368" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key the chat template expects that indicates the reasoning trace.</span></span>
-<span id="cb1-369"><a href="#cb1-369" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">template_thinking_key</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-370"><a href="#cb1-370" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-371"><a href="#cb1-371" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_role</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-301"><a href="#cb1-301" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_padding_free</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-302"><a href="#cb1-302" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-303"><a href="#cb1-303" aria-hidden="true" tabindex="-1"></a><span class="co"># A list of one or more datasets to finetune the model with</span></span>
+<span id="cb1-304"><a href="#cb1-304" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span><span class="at"> Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset | SyntheticDataset], MinLen(1)] | None</span></span>
+<span id="cb1-305"><a href="#cb1-305" aria-hidden="true" tabindex="-1"></a><span class="co">  # For SFTDataset:</span></span>
+<span id="cb1-306"><a href="#cb1-306" aria-hidden="true" tabindex="-1"></a><span class="co">  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory</span></span>
+<span id="cb1-307"><a href="#cb1-307" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-308"><a href="#cb1-308" aria-hidden="true" tabindex="-1"></a><span class="co">  # name of dataset split to load from</span></span>
+<span id="cb1-309"><a href="#cb1-309" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-310"><a href="#cb1-310" aria-hidden="true" tabindex="-1"></a><span class="co">  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]</span></span>
+<span id="cb1-311"><a href="#cb1-311" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> str | UserDefinedPrompterType | None</span></span>
+<span id="cb1-312"><a href="#cb1-312" aria-hidden="true" tabindex="-1"></a><span class="co">    # For UserDefinedPrompterType:</span></span>
+<span id="cb1-313"><a href="#cb1-313" aria-hidden="true" tabindex="-1"></a><span class="co">    # Custom user instruction prompt</span></span>
+<span id="cb1-314"><a href="#cb1-314" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">system_prompt</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-315"><a href="#cb1-315" aria-hidden="true" tabindex="-1"></a><span class="co">    # Use {system} as key to be replaced</span></span>
+<span id="cb1-316"><a href="#cb1-316" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">system_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-317"><a href="#cb1-317" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_system</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-318"><a href="#cb1-318" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_instruction</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-319"><a href="#cb1-319" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_input</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-320"><a href="#cb1-320" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_output</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-321"><a href="#cb1-321" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-322"><a href="#cb1-322" aria-hidden="true" tabindex="-1"></a><span class="co">    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to</span></span>
+<span id="cb1-323"><a href="#cb1-323" aria-hidden="true" tabindex="-1"></a><span class="co">    # be replaced. 'format' can include {input}</span></span>
+<span id="cb1-324"><a href="#cb1-324" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-325"><a href="#cb1-325" aria-hidden="true" tabindex="-1"></a><span class="co">    # 'no_input_format' cannot include {input}</span></span>
+<span id="cb1-326"><a href="#cb1-326" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">no_input_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-327"><a href="#cb1-327" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">input_transform</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-328"><a href="#cb1-328" aria-hidden="true" tabindex="-1"></a><span class="co">  # split dataset into N pieces (use with shards_idx)</span></span>
+<span id="cb1-329"><a href="#cb1-329" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">shards</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-330"><a href="#cb1-330" aria-hidden="true" tabindex="-1"></a><span class="co">  # the index of sharded dataset to use</span></span>
+<span id="cb1-331"><a href="#cb1-331" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">shards_idx</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-332"><a href="#cb1-332" aria-hidden="true" tabindex="-1"></a><span class="co">  # process dataset in N sequential chunks for memory efficiency (exclusive with</span></span>
+<span id="cb1-333"><a href="#cb1-333" aria-hidden="true" tabindex="-1"></a><span class="co">  # `shards`)</span></span>
+<span id="cb1-334"><a href="#cb1-334" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">preprocess_shards</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-335"><a href="#cb1-335" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">conversation</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-336"><a href="#cb1-336" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-337"><a href="#cb1-337" aria-hidden="true" tabindex="-1"></a><span class="co">  # The name of the chat template to use for training, following values are supported:</span></span>
+<span id="cb1-338"><a href="#cb1-338" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default: Uses the chat template that is available in the</span></span>
+<span id="cb1-339"><a href="#cb1-339" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_config.json. If the chat template is not available in the tokenizer, it</span></span>
+<span id="cb1-340"><a href="#cb1-340" aria-hidden="true" tabindex="-1"></a><span class="co">  # will raise an error. This is the default.</span></span>
+<span id="cb1-341"><a href="#cb1-341" aria-hidden="true" tabindex="-1"></a><span class="co">  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates</span></span>
+<span id="cb1-342"><a href="#cb1-342" aria-hidden="true" tabindex="-1"></a><span class="co">  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.</span></span>
+<span id="cb1-343"><a href="#cb1-343" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback</span></span>
+<span id="cb1-344"><a href="#cb1-344" aria-hidden="true" tabindex="-1"></a><span class="co">  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.</span></span>
+<span id="cb1-345"><a href="#cb1-345" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat</span></span>
+<span id="cb1-346"><a href="#cb1-346" aria-hidden="true" tabindex="-1"></a><span class="co">  # template. The custom jinja template should be provided in the chat_template_jinja</span></span>
+<span id="cb1-347"><a href="#cb1-347" aria-hidden="true" tabindex="-1"></a><span class="co">  # field.</span></span>
+<span id="cb1-348"><a href="#cb1-348" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">chat_template</span><span class="kw">:</span><span class="at"> ChatTemplate | str | None</span></span>
+<span id="cb1-349"><a href="#cb1-349" aria-hidden="true" tabindex="-1"></a><span class="co">  # Custom jinja chat template or path to jinja file. Used only if `chat_template:</span></span>
+<span id="cb1-350"><a href="#cb1-350" aria-hidden="true" tabindex="-1"></a><span class="co">  # jinja` or empty.</span></span>
+<span id="cb1-351"><a href="#cb1-351" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">chat_template_jinja</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-352"><a href="#cb1-352" aria-hidden="true" tabindex="-1"></a><span class="co">  # path to source data files</span></span>
+<span id="cb1-353"><a href="#cb1-353" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> str | list[str] | None</span></span>
+<span id="cb1-354"><a href="#cb1-354" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">input_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-355"><a href="#cb1-355" aria-hidden="true" tabindex="-1"></a><span class="co">  # name of dataset configuration to load</span></span>
+<span id="cb1-356"><a href="#cb1-356" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-357"><a href="#cb1-357" aria-hidden="true" tabindex="-1"></a><span class="co">  # defines the datatype when path is a file</span></span>
+<span id="cb1-358"><a href="#cb1-358" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">ds_type</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-359"><a href="#cb1-359" aria-hidden="true" tabindex="-1"></a><span class="co">  # For `completion` datasets only, uses the provided field instead of `text` column</span></span>
+<span id="cb1-360"><a href="#cb1-360" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-361"><a href="#cb1-361" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_human</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-362"><a href="#cb1-362" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-363"><a href="#cb1-363" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the messages (default: "messages")</span></span>
+<span id="cb1-364"><a href="#cb1-364" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_messages</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-365"><a href="#cb1-365" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON</span></span>
+<span id="cb1-366"><a href="#cb1-366" aria-hidden="true" tabindex="-1"></a><span class="co">  # schema](https://json-schema.org/learn/getting-started-step-by-step).</span></span>
+<span id="cb1-367"><a href="#cb1-367" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_tools</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-368"><a href="#cb1-368" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the reasoning trace (default: "reasoning_content").</span></span>
+<span id="cb1-369"><a href="#cb1-369" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_thinking</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-370"><a href="#cb1-370" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key the chat template expects that indicates the reasoning trace.</span></span>
+<span id="cb1-371"><a href="#cb1-371" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">template_thinking_key</span><span class="kw">:</span><span class="at"> str | None</span></span>
 <span id="cb1-372"><a href="#cb1-372" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-373"><a href="#cb1-373" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_content</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-374"><a href="#cb1-374" aria-hidden="true" tabindex="-1"></a><span class="co">  # Mapping of properties from the input dataset to the chat template. (default:</span></span>
-<span id="cb1-375"><a href="#cb1-375" aria-hidden="true" tabindex="-1"></a><span class="co">  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists</span></span>
-<span id="cb1-376"><a href="#cb1-376" aria-hidden="true" tabindex="-1"></a><span class="co">  # in the template but not in this mapping, the system will attempt to load it directly</span></span>
-<span id="cb1-377"><a href="#cb1-377" aria-hidden="true" tabindex="-1"></a><span class="co">  # from the message using the property name as the key. Example: In the mapping below,</span></span>
-<span id="cb1-378"><a href="#cb1-378" aria-hidden="true" tabindex="-1"></a><span class="co">  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and</span></span>
-<span id="cb1-379"><a href="#cb1-379" aria-hidden="true" tabindex="-1"></a><span class="co">  # used as 'content' in the chat template.</span></span>
-<span id="cb1-380"><a href="#cb1-380" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_property_mappings</span><span class="kw">:</span><span class="at"> dict[str, str] | None</span></span>
-<span id="cb1-381"><a href="#cb1-381" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key in the message turn that indicates via boolean whether tokens of a turn</span></span>
-<span id="cb1-382"><a href="#cb1-382" aria-hidden="true" tabindex="-1"></a><span class="co">  # should be considered for training. Useful to selectively train on certain turns</span></span>
-<span id="cb1-383"><a href="#cb1-383" aria-hidden="true" tabindex="-1"></a><span class="co">  # besides the `roles_to_train`.</span></span>
-<span id="cb1-384"><a href="#cb1-384" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_training</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-385"><a href="#cb1-385" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key in the message turn that contains the training details. Useful to</span></span>
-<span id="cb1-386"><a href="#cb1-386" aria-hidden="true" tabindex="-1"></a><span class="co">  # selectively train on certain tokens in a turn. The value of the key is a List[Dict]</span></span>
-<span id="cb1-387"><a href="#cb1-387" aria-hidden="true" tabindex="-1"></a><span class="co">  # containing `begin_offset` (start character index in content), `end_offset` (end</span></span>
-<span id="cb1-388"><a href="#cb1-388" aria-hidden="true" tabindex="-1"></a><span class="co">  # character index in content), and `train` (boolean whether to train).</span></span>
-<span id="cb1-389"><a href="#cb1-389" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_training_detail</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-390"><a href="#cb1-390" aria-hidden="true" tabindex="-1"></a><span class="co">  # (for Qwen3 template only) Whether to split the assistant content based on a</span></span>
-<span id="cb1-391"><a href="#cb1-391" aria-hidden="true" tabindex="-1"></a><span class="co">  # reasoning trace inside delimited tags</span></span>
-<span id="cb1-392"><a href="#cb1-392" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split_thinking</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-393"><a href="#cb1-393" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">logprobs_field</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-394"><a href="#cb1-394" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">temperature</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-395"><a href="#cb1-395" aria-hidden="true" tabindex="-1"></a><span class="co">  # Roles to train on. The tokens from these roles will be considered for the loss.</span></span>
-<span id="cb1-396"><a href="#cb1-396" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">roles_to_train</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-397"><a href="#cb1-397" aria-hidden="true" tabindex="-1"></a><span class="co">  # Which EOS tokens to train on in the conversation. Possible values are: all: train on</span></span>
-<span id="cb1-398"><a href="#cb1-398" aria-hidden="true" tabindex="-1"></a><span class="co">  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable</span></span>
-<span id="cb1-399"><a href="#cb1-399" aria-hidden="true" tabindex="-1"></a><span class="co">  # turn, last: train on the last EOS token in the conversation</span></span>
-<span id="cb1-400"><a href="#cb1-400" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">train_on_eos</span><span class="kw">:</span><span class="at"> Literal['all', 'turn', 'last'] | None</span></span>
-<span id="cb1-401"><a href="#cb1-401" aria-hidden="true" tabindex="-1"></a><span class="co">  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All</span></span>
-<span id="cb1-402"><a href="#cb1-402" aria-hidden="true" tabindex="-1"></a><span class="co">  # source roles will be mapped to the target role. The default is: user: ["human",</span></span>
-<span id="cb1-403"><a href="#cb1-403" aria-hidden="true" tabindex="-1"></a><span class="co">  # "user"], assistant: ["gpt", "assistant"], system: ["system"], tool: ["tool"]</span></span>
-<span id="cb1-404"><a href="#cb1-404" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">roles</span><span class="kw">:</span><span class="at"> dict[str, list[str]] | None</span></span>
-<span id="cb1-405"><a href="#cb1-405" aria-hidden="true" tabindex="-1"></a><span class="co">  # Whether to drop the system turn from the dataset. Only works with chat_template.</span></span>
-<span id="cb1-406"><a href="#cb1-406" aria-hidden="true" tabindex="-1"></a><span class="co">  # This does not drop the default system message from chat_template if it exists. If</span></span>
-<span id="cb1-407"><a href="#cb1-407" aria-hidden="true" tabindex="-1"></a><span class="co">  # you wish to, we recommend using a custom jinja template with the default system</span></span>
-<span id="cb1-408"><a href="#cb1-408" aria-hidden="true" tabindex="-1"></a><span class="co">  # message removed or adding a system turn with empty content.</span></span>
-<span id="cb1-409"><a href="#cb1-409" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">drop_system_message</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-410"><a href="#cb1-410" aria-hidden="true" tabindex="-1"></a><span class="co">  # Trust remote code for untrusted source</span></span>
-<span id="cb1-411"><a href="#cb1-411" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">trust_remote_code</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-412"><a href="#cb1-412" aria-hidden="true" tabindex="-1"></a><span class="co">  # The specific revision of the dataset to use when loading from the Hugging Face Hub.</span></span>
-<span id="cb1-413"><a href="#cb1-413" aria-hidden="true" tabindex="-1"></a><span class="co">  # This can be a commit hash, tag, or branch name. If not specified, the latest version</span></span>
-<span id="cb1-414"><a href="#cb1-414" aria-hidden="true" tabindex="-1"></a><span class="co">  # will be used. This parameter is ignored for local datasets.</span></span>
-<span id="cb1-415"><a href="#cb1-415" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-416"><a href="#cb1-416" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-417"><a href="#cb1-417" aria-hidden="true" tabindex="-1"></a><span class="co">  # For DPODataset:</span></span>
-<span id="cb1-418"><a href="#cb1-418" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-419"><a href="#cb1-419" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-420"><a href="#cb1-420" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> UserDefinedDPOType | str | None</span></span>
-<span id="cb1-421"><a href="#cb1-421" aria-hidden="true" tabindex="-1"></a><span class="co">    # For UserDefinedDPOType:</span></span>
-<span id="cb1-422"><a href="#cb1-422" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_system</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-423"><a href="#cb1-423" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_prompt</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-424"><a href="#cb1-424" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_chosen</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-425"><a href="#cb1-425" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_rejected</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-426"><a href="#cb1-426" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">prompt_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-427"><a href="#cb1-427" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">chosen_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-428"><a href="#cb1-428" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">rejected_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-429"><a href="#cb1-429" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-430"><a href="#cb1-430" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-431"><a href="#cb1-431" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_messages</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-432"><a href="#cb1-432" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-433"><a href="#cb1-433" aria-hidden="true" tabindex="-1"></a><span class="co">  # For KTODataset:</span></span>
-<span id="cb1-434"><a href="#cb1-434" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-435"><a href="#cb1-435" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-436"><a href="#cb1-436" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> UserDefinedKTOType | str | None</span></span>
-<span id="cb1-437"><a href="#cb1-437" aria-hidden="true" tabindex="-1"></a><span class="co">    # For UserDefinedKTOType:</span></span>
-<span id="cb1-438"><a href="#cb1-438" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_system</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-439"><a href="#cb1-439" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_prompt</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-440"><a href="#cb1-440" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_completion</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-441"><a href="#cb1-441" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_label</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-442"><a href="#cb1-442" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">prompt_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-443"><a href="#cb1-443" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">completion_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-444"><a href="#cb1-444" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-445"><a href="#cb1-445" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">trust_remote_code</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-446"><a href="#cb1-446" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-447"><a href="#cb1-447" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-448"><a href="#cb1-448" aria-hidden="true" tabindex="-1"></a><span class="co">  # For StepwiseSupervisedDataset:</span></span>
-<span id="cb1-449"><a href="#cb1-449" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-450"><a href="#cb1-450" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-451"><a href="#cb1-451" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-452"><a href="#cb1-452" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-453"><a href="#cb1-453" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">step_separator</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-454"><a href="#cb1-454" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">max_completion_length</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-455"><a href="#cb1-455" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">train_on_last_step_only</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-456"><a href="#cb1-456" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-457"><a href="#cb1-457" aria-hidden="true" tabindex="-1"></a><span class="co">  # For SyntheticDataset:</span></span>
-<span id="cb1-458"><a href="#cb1-458" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> Literal['synthetic'] = synthetic</span></span>
-<span id="cb1-459"><a href="#cb1-459" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> Literal['_synthetic'] = _synthetic</span></span>
-<span id="cb1-460"><a href="#cb1-460" aria-hidden="true" tabindex="-1"></a><span class="co">  # Number of rows to generate</span></span>
-<span id="cb1-461"><a href="#cb1-461" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">length</span><span class="kw">:</span><span class="at"> int = 1000</span></span>
-<span id="cb1-462"><a href="#cb1-462" aria-hidden="true" tabindex="-1"></a><span class="co">  # Sequence length per row (defaults to sequence_len from config)</span></span>
-<span id="cb1-463"><a href="#cb1-463" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">sequence_length</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-464"><a href="#cb1-464" aria-hidden="true" tabindex="-1"></a><span class="co">  # Minimum token ID for generation</span></span>
-<span id="cb1-465"><a href="#cb1-465" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">min_input_id</span><span class="kw">:</span><span class="at"> int = 100</span></span>
-<span id="cb1-466"><a href="#cb1-466" aria-hidden="true" tabindex="-1"></a><span class="co">  # Maximum token ID for generation (defaults to tokenizer vocab_size)</span></span>
-<span id="cb1-467"><a href="#cb1-467" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">max_input_id</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-468"><a href="#cb1-468" aria-hidden="true" tabindex="-1"></a><span class="co">  # Random seed for reproducibility</span></span>
-<span id="cb1-469"><a href="#cb1-469" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">seed</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-470"><a href="#cb1-470" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-471"><a href="#cb1-471" aria-hidden="true" tabindex="-1"></a><span class="co"># A list of one or more datasets to eval the model with. You can use either</span></span>
-<span id="cb1-472"><a href="#cb1-472" aria-hidden="true" tabindex="-1"></a><span class="co"># test_datasets, or val_set_size, but not both.</span></span>
-<span id="cb1-473"><a href="#cb1-473" aria-hidden="true" tabindex="-1"></a><span class="fu">test_datasets</span><span class="kw">:</span><span class="at"> Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset | SyntheticDataset], MinLen(1)] | None</span></span>
-<span id="cb1-474"><a href="#cb1-474" aria-hidden="true" tabindex="-1"></a><span class="co">  # For SFTDataset:</span></span>
-<span id="cb1-475"><a href="#cb1-475" aria-hidden="true" tabindex="-1"></a><span class="co">  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory</span></span>
-<span id="cb1-476"><a href="#cb1-476" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-477"><a href="#cb1-477" aria-hidden="true" tabindex="-1"></a><span class="co">  # name of dataset split to load from</span></span>
-<span id="cb1-478"><a href="#cb1-478" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-479"><a href="#cb1-479" aria-hidden="true" tabindex="-1"></a><span class="co">  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]</span></span>
-<span id="cb1-480"><a href="#cb1-480" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> str | UserDefinedPrompterType | None</span></span>
-<span id="cb1-481"><a href="#cb1-481" aria-hidden="true" tabindex="-1"></a><span class="co">    # For UserDefinedPrompterType:</span></span>
-<span id="cb1-482"><a href="#cb1-482" aria-hidden="true" tabindex="-1"></a><span class="co">    # Custom user instruction prompt</span></span>
-<span id="cb1-483"><a href="#cb1-483" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">system_prompt</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-484"><a href="#cb1-484" aria-hidden="true" tabindex="-1"></a><span class="co">    # Use {system} as key to be replaced</span></span>
-<span id="cb1-485"><a href="#cb1-485" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">system_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-486"><a href="#cb1-486" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_system</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-487"><a href="#cb1-487" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_instruction</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-488"><a href="#cb1-488" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_input</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-489"><a href="#cb1-489" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_output</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-490"><a href="#cb1-490" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-491"><a href="#cb1-491" aria-hidden="true" tabindex="-1"></a><span class="co">    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to</span></span>
-<span id="cb1-492"><a href="#cb1-492" aria-hidden="true" tabindex="-1"></a><span class="co">    # be replaced. 'format' can include {input}</span></span>
-<span id="cb1-493"><a href="#cb1-493" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-494"><a href="#cb1-494" aria-hidden="true" tabindex="-1"></a><span class="co">    # 'no_input_format' cannot include {input}</span></span>
-<span id="cb1-495"><a href="#cb1-495" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">no_input_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-496"><a href="#cb1-496" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">input_transform</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-497"><a href="#cb1-497" aria-hidden="true" tabindex="-1"></a><span class="co">  # split dataset into N pieces (use with shards_idx)</span></span>
-<span id="cb1-498"><a href="#cb1-498" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">shards</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-499"><a href="#cb1-499" aria-hidden="true" tabindex="-1"></a><span class="co">  # the index of sharded dataset to use</span></span>
-<span id="cb1-500"><a href="#cb1-500" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">shards_idx</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-501"><a href="#cb1-501" aria-hidden="true" tabindex="-1"></a><span class="co">  # process dataset in N sequential chunks for memory efficiency (exclusive with</span></span>
-<span id="cb1-502"><a href="#cb1-502" aria-hidden="true" tabindex="-1"></a><span class="co">  # `shards`)</span></span>
-<span id="cb1-503"><a href="#cb1-503" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">preprocess_shards</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-504"><a href="#cb1-504" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">conversation</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-505"><a href="#cb1-505" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-506"><a href="#cb1-506" aria-hidden="true" tabindex="-1"></a><span class="co">  # The name of the chat template to use for training, following values are supported:</span></span>
-<span id="cb1-507"><a href="#cb1-507" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default: Uses the chat template that is available in the</span></span>
-<span id="cb1-508"><a href="#cb1-508" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_config.json. If the chat template is not available in the tokenizer, it</span></span>
-<span id="cb1-509"><a href="#cb1-509" aria-hidden="true" tabindex="-1"></a><span class="co">  # will raise an error. This is the default.</span></span>
-<span id="cb1-510"><a href="#cb1-510" aria-hidden="true" tabindex="-1"></a><span class="co">  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates</span></span>
-<span id="cb1-511"><a href="#cb1-511" aria-hidden="true" tabindex="-1"></a><span class="co">  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.</span></span>
-<span id="cb1-512"><a href="#cb1-512" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback</span></span>
-<span id="cb1-513"><a href="#cb1-513" aria-hidden="true" tabindex="-1"></a><span class="co">  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.</span></span>
-<span id="cb1-514"><a href="#cb1-514" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat</span></span>
-<span id="cb1-515"><a href="#cb1-515" aria-hidden="true" tabindex="-1"></a><span class="co">  # template. The custom jinja template should be provided in the chat_template_jinja</span></span>
-<span id="cb1-516"><a href="#cb1-516" aria-hidden="true" tabindex="-1"></a><span class="co">  # field.</span></span>
-<span id="cb1-517"><a href="#cb1-517" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">chat_template</span><span class="kw">:</span><span class="at"> ChatTemplate | str | None</span></span>
-<span id="cb1-518"><a href="#cb1-518" aria-hidden="true" tabindex="-1"></a><span class="co">  # Custom jinja chat template or path to jinja file. Used only if `chat_template:</span></span>
-<span id="cb1-519"><a href="#cb1-519" aria-hidden="true" tabindex="-1"></a><span class="co">  # jinja` or empty.</span></span>
-<span id="cb1-520"><a href="#cb1-520" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">chat_template_jinja</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-521"><a href="#cb1-521" aria-hidden="true" tabindex="-1"></a><span class="co">  # path to source data files</span></span>
-<span id="cb1-522"><a href="#cb1-522" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> str | list[str] | None</span></span>
-<span id="cb1-523"><a href="#cb1-523" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">input_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-524"><a href="#cb1-524" aria-hidden="true" tabindex="-1"></a><span class="co">  # name of dataset configuration to load</span></span>
-<span id="cb1-525"><a href="#cb1-525" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-526"><a href="#cb1-526" aria-hidden="true" tabindex="-1"></a><span class="co">  # defines the datatype when path is a file</span></span>
-<span id="cb1-527"><a href="#cb1-527" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">ds_type</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-528"><a href="#cb1-528" aria-hidden="true" tabindex="-1"></a><span class="co">  # For `completion` datasets only, uses the provided field instead of `text` column</span></span>
-<span id="cb1-529"><a href="#cb1-529" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-530"><a href="#cb1-530" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_human</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-531"><a href="#cb1-531" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-532"><a href="#cb1-532" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the messages (default: "messages")</span></span>
-<span id="cb1-533"><a href="#cb1-533" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_messages</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-534"><a href="#cb1-534" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON</span></span>
-<span id="cb1-535"><a href="#cb1-535" aria-hidden="true" tabindex="-1"></a><span class="co">  # schema](https://json-schema.org/learn/getting-started-step-by-step).</span></span>
-<span id="cb1-536"><a href="#cb1-536" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_tools</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-537"><a href="#cb1-537" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the reasoning trace (default: "reasoning_content").</span></span>
-<span id="cb1-538"><a href="#cb1-538" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_thinking</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-539"><a href="#cb1-539" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key the chat template expects that indicates the reasoning trace.</span></span>
-<span id="cb1-540"><a href="#cb1-540" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">template_thinking_key</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-541"><a href="#cb1-541" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-542"><a href="#cb1-542" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_role</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-373"><a href="#cb1-373" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_role</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-374"><a href="#cb1-374" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-375"><a href="#cb1-375" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_content</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-376"><a href="#cb1-376" aria-hidden="true" tabindex="-1"></a><span class="co">  # Mapping of properties from the input dataset to the chat template. (default:</span></span>
+<span id="cb1-377"><a href="#cb1-377" aria-hidden="true" tabindex="-1"></a><span class="co">  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists</span></span>
+<span id="cb1-378"><a href="#cb1-378" aria-hidden="true" tabindex="-1"></a><span class="co">  # in the template but not in this mapping, the system will attempt to load it directly</span></span>
+<span id="cb1-379"><a href="#cb1-379" aria-hidden="true" tabindex="-1"></a><span class="co">  # from the message using the property name as the key. Example: In the mapping below,</span></span>
+<span id="cb1-380"><a href="#cb1-380" aria-hidden="true" tabindex="-1"></a><span class="co">  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and</span></span>
+<span id="cb1-381"><a href="#cb1-381" aria-hidden="true" tabindex="-1"></a><span class="co">  # used as 'content' in the chat template.</span></span>
+<span id="cb1-382"><a href="#cb1-382" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_property_mappings</span><span class="kw">:</span><span class="at"> dict[str, str] | None</span></span>
+<span id="cb1-383"><a href="#cb1-383" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key in the message turn that indicates via boolean whether tokens of a turn</span></span>
+<span id="cb1-384"><a href="#cb1-384" aria-hidden="true" tabindex="-1"></a><span class="co">  # should be considered for training. Useful to selectively train on certain turns</span></span>
+<span id="cb1-385"><a href="#cb1-385" aria-hidden="true" tabindex="-1"></a><span class="co">  # besides the `roles_to_train`.</span></span>
+<span id="cb1-386"><a href="#cb1-386" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_training</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-387"><a href="#cb1-387" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key in the message turn that contains the training details. Useful to</span></span>
+<span id="cb1-388"><a href="#cb1-388" aria-hidden="true" tabindex="-1"></a><span class="co">  # selectively train on certain tokens in a turn. The value of the key is a List[Dict]</span></span>
+<span id="cb1-389"><a href="#cb1-389" aria-hidden="true" tabindex="-1"></a><span class="co">  # containing `begin_offset` (start character index in content), `end_offset` (end</span></span>
+<span id="cb1-390"><a href="#cb1-390" aria-hidden="true" tabindex="-1"></a><span class="co">  # character index in content), and `train` (boolean whether to train).</span></span>
+<span id="cb1-391"><a href="#cb1-391" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_training_detail</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-392"><a href="#cb1-392" aria-hidden="true" tabindex="-1"></a><span class="co">  # (for Qwen3 template only) Whether to split the assistant content based on a</span></span>
+<span id="cb1-393"><a href="#cb1-393" aria-hidden="true" tabindex="-1"></a><span class="co">  # reasoning trace inside delimited tags</span></span>
+<span id="cb1-394"><a href="#cb1-394" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split_thinking</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-395"><a href="#cb1-395" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">logprobs_field</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-396"><a href="#cb1-396" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">temperature</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-397"><a href="#cb1-397" aria-hidden="true" tabindex="-1"></a><span class="co">  # Roles to train on. The tokens from these roles will be considered for the loss.</span></span>
+<span id="cb1-398"><a href="#cb1-398" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">roles_to_train</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-399"><a href="#cb1-399" aria-hidden="true" tabindex="-1"></a><span class="co">  # Which EOS tokens to train on in the conversation. Possible values are: all: train on</span></span>
+<span id="cb1-400"><a href="#cb1-400" aria-hidden="true" tabindex="-1"></a><span class="co">  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable</span></span>
+<span id="cb1-401"><a href="#cb1-401" aria-hidden="true" tabindex="-1"></a><span class="co">  # turn, last: train on the last EOS token in the conversation</span></span>
+<span id="cb1-402"><a href="#cb1-402" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">train_on_eos</span><span class="kw">:</span><span class="at"> Literal['all', 'turn', 'last'] | None</span></span>
+<span id="cb1-403"><a href="#cb1-403" aria-hidden="true" tabindex="-1"></a><span class="co">  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All</span></span>
+<span id="cb1-404"><a href="#cb1-404" aria-hidden="true" tabindex="-1"></a><span class="co">  # source roles will be mapped to the target role. The default is: user: ["human",</span></span>
+<span id="cb1-405"><a href="#cb1-405" aria-hidden="true" tabindex="-1"></a><span class="co">  # "user"], assistant: ["gpt", "assistant"], system: ["system"], tool: ["tool"]</span></span>
+<span id="cb1-406"><a href="#cb1-406" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">roles</span><span class="kw">:</span><span class="at"> dict[str, list[str]] | None</span></span>
+<span id="cb1-407"><a href="#cb1-407" aria-hidden="true" tabindex="-1"></a><span class="co">  # Whether to drop the system turn from the dataset. Only works with chat_template.</span></span>
+<span id="cb1-408"><a href="#cb1-408" aria-hidden="true" tabindex="-1"></a><span class="co">  # This does not drop the default system message from chat_template if it exists. If</span></span>
+<span id="cb1-409"><a href="#cb1-409" aria-hidden="true" tabindex="-1"></a><span class="co">  # you wish to, we recommend using a custom jinja template with the default system</span></span>
+<span id="cb1-410"><a href="#cb1-410" aria-hidden="true" tabindex="-1"></a><span class="co">  # message removed or adding a system turn with empty content.</span></span>
+<span id="cb1-411"><a href="#cb1-411" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">drop_system_message</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-412"><a href="#cb1-412" aria-hidden="true" tabindex="-1"></a><span class="co">  # Trust remote code for untrusted source</span></span>
+<span id="cb1-413"><a href="#cb1-413" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">trust_remote_code</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-414"><a href="#cb1-414" aria-hidden="true" tabindex="-1"></a><span class="co">  # The specific revision of the dataset to use when loading from the Hugging Face Hub.</span></span>
+<span id="cb1-415"><a href="#cb1-415" aria-hidden="true" tabindex="-1"></a><span class="co">  # This can be a commit hash, tag, or branch name. If not specified, the latest version</span></span>
+<span id="cb1-416"><a href="#cb1-416" aria-hidden="true" tabindex="-1"></a><span class="co">  # will be used. This parameter is ignored for local datasets.</span></span>
+<span id="cb1-417"><a href="#cb1-417" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-418"><a href="#cb1-418" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-419"><a href="#cb1-419" aria-hidden="true" tabindex="-1"></a><span class="co">  # For DPODataset:</span></span>
+<span id="cb1-420"><a href="#cb1-420" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-421"><a href="#cb1-421" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-422"><a href="#cb1-422" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> UserDefinedDPOType | str | None</span></span>
+<span id="cb1-423"><a href="#cb1-423" aria-hidden="true" tabindex="-1"></a><span class="co">    # For UserDefinedDPOType:</span></span>
+<span id="cb1-424"><a href="#cb1-424" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_system</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-425"><a href="#cb1-425" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_prompt</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-426"><a href="#cb1-426" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_chosen</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-427"><a href="#cb1-427" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_rejected</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-428"><a href="#cb1-428" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">prompt_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-429"><a href="#cb1-429" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">chosen_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-430"><a href="#cb1-430" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">rejected_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-431"><a href="#cb1-431" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-432"><a href="#cb1-432" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-433"><a href="#cb1-433" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_messages</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-434"><a href="#cb1-434" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-435"><a href="#cb1-435" aria-hidden="true" tabindex="-1"></a><span class="co">  # For KTODataset:</span></span>
+<span id="cb1-436"><a href="#cb1-436" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-437"><a href="#cb1-437" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-438"><a href="#cb1-438" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> UserDefinedKTOType | str | None</span></span>
+<span id="cb1-439"><a href="#cb1-439" aria-hidden="true" tabindex="-1"></a><span class="co">    # For UserDefinedKTOType:</span></span>
+<span id="cb1-440"><a href="#cb1-440" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_system</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-441"><a href="#cb1-441" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_prompt</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-442"><a href="#cb1-442" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_completion</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-443"><a href="#cb1-443" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_label</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-444"><a href="#cb1-444" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">prompt_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-445"><a href="#cb1-445" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">completion_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-446"><a href="#cb1-446" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-447"><a href="#cb1-447" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">trust_remote_code</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-448"><a href="#cb1-448" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-449"><a href="#cb1-449" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-450"><a href="#cb1-450" aria-hidden="true" tabindex="-1"></a><span class="co">  # For StepwiseSupervisedDataset:</span></span>
+<span id="cb1-451"><a href="#cb1-451" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-452"><a href="#cb1-452" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-453"><a href="#cb1-453" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-454"><a href="#cb1-454" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-455"><a href="#cb1-455" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">step_separator</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-456"><a href="#cb1-456" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">max_completion_length</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-457"><a href="#cb1-457" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">train_on_last_step_only</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-458"><a href="#cb1-458" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-459"><a href="#cb1-459" aria-hidden="true" tabindex="-1"></a><span class="co">  # For SyntheticDataset:</span></span>
+<span id="cb1-460"><a href="#cb1-460" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> Literal['synthetic'] = synthetic</span></span>
+<span id="cb1-461"><a href="#cb1-461" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> Literal['_synthetic'] = _synthetic</span></span>
+<span id="cb1-462"><a href="#cb1-462" aria-hidden="true" tabindex="-1"></a><span class="co">  # Number of rows to generate</span></span>
+<span id="cb1-463"><a href="#cb1-463" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">length</span><span class="kw">:</span><span class="at"> int = 1000</span></span>
+<span id="cb1-464"><a href="#cb1-464" aria-hidden="true" tabindex="-1"></a><span class="co">  # Sequence length per row (defaults to sequence_len from config)</span></span>
+<span id="cb1-465"><a href="#cb1-465" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">sequence_length</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-466"><a href="#cb1-466" aria-hidden="true" tabindex="-1"></a><span class="co">  # Minimum token ID for generation</span></span>
+<span id="cb1-467"><a href="#cb1-467" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">min_input_id</span><span class="kw">:</span><span class="at"> int = 100</span></span>
+<span id="cb1-468"><a href="#cb1-468" aria-hidden="true" tabindex="-1"></a><span class="co">  # Maximum token ID for generation (defaults to tokenizer vocab_size)</span></span>
+<span id="cb1-469"><a href="#cb1-469" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">max_input_id</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-470"><a href="#cb1-470" aria-hidden="true" tabindex="-1"></a><span class="co">  # Random seed for reproducibility</span></span>
+<span id="cb1-471"><a href="#cb1-471" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">seed</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-472"><a href="#cb1-472" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-473"><a href="#cb1-473" aria-hidden="true" tabindex="-1"></a><span class="co"># A list of one or more datasets to eval the model with. You can use either</span></span>
+<span id="cb1-474"><a href="#cb1-474" aria-hidden="true" tabindex="-1"></a><span class="co"># test_datasets, or val_set_size, but not both.</span></span>
+<span id="cb1-475"><a href="#cb1-475" aria-hidden="true" tabindex="-1"></a><span class="fu">test_datasets</span><span class="kw">:</span><span class="at"> Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset | SyntheticDataset], MinLen(1)] | None</span></span>
+<span id="cb1-476"><a href="#cb1-476" aria-hidden="true" tabindex="-1"></a><span class="co">  # For SFTDataset:</span></span>
+<span id="cb1-477"><a href="#cb1-477" aria-hidden="true" tabindex="-1"></a><span class="co">  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory</span></span>
+<span id="cb1-478"><a href="#cb1-478" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-479"><a href="#cb1-479" aria-hidden="true" tabindex="-1"></a><span class="co">  # name of dataset split to load from</span></span>
+<span id="cb1-480"><a href="#cb1-480" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-481"><a href="#cb1-481" aria-hidden="true" tabindex="-1"></a><span class="co">  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]</span></span>
+<span id="cb1-482"><a href="#cb1-482" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> str | UserDefinedPrompterType | None</span></span>
+<span id="cb1-483"><a href="#cb1-483" aria-hidden="true" tabindex="-1"></a><span class="co">    # For UserDefinedPrompterType:</span></span>
+<span id="cb1-484"><a href="#cb1-484" aria-hidden="true" tabindex="-1"></a><span class="co">    # Custom user instruction prompt</span></span>
+<span id="cb1-485"><a href="#cb1-485" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">system_prompt</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-486"><a href="#cb1-486" aria-hidden="true" tabindex="-1"></a><span class="co">    # Use {system} as key to be replaced</span></span>
+<span id="cb1-487"><a href="#cb1-487" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">system_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-488"><a href="#cb1-488" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_system</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-489"><a href="#cb1-489" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_instruction</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-490"><a href="#cb1-490" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_input</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-491"><a href="#cb1-491" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_output</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-492"><a href="#cb1-492" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-493"><a href="#cb1-493" aria-hidden="true" tabindex="-1"></a><span class="co">    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to</span></span>
+<span id="cb1-494"><a href="#cb1-494" aria-hidden="true" tabindex="-1"></a><span class="co">    # be replaced. 'format' can include {input}</span></span>
+<span id="cb1-495"><a href="#cb1-495" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-496"><a href="#cb1-496" aria-hidden="true" tabindex="-1"></a><span class="co">    # 'no_input_format' cannot include {input}</span></span>
+<span id="cb1-497"><a href="#cb1-497" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">no_input_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-498"><a href="#cb1-498" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">input_transform</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-499"><a href="#cb1-499" aria-hidden="true" tabindex="-1"></a><span class="co">  # split dataset into N pieces (use with shards_idx)</span></span>
+<span id="cb1-500"><a href="#cb1-500" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">shards</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-501"><a href="#cb1-501" aria-hidden="true" tabindex="-1"></a><span class="co">  # the index of sharded dataset to use</span></span>
+<span id="cb1-502"><a href="#cb1-502" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">shards_idx</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-503"><a href="#cb1-503" aria-hidden="true" tabindex="-1"></a><span class="co">  # process dataset in N sequential chunks for memory efficiency (exclusive with</span></span>
+<span id="cb1-504"><a href="#cb1-504" aria-hidden="true" tabindex="-1"></a><span class="co">  # `shards`)</span></span>
+<span id="cb1-505"><a href="#cb1-505" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">preprocess_shards</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-506"><a href="#cb1-506" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">conversation</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-507"><a href="#cb1-507" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-508"><a href="#cb1-508" aria-hidden="true" tabindex="-1"></a><span class="co">  # The name of the chat template to use for training, following values are supported:</span></span>
+<span id="cb1-509"><a href="#cb1-509" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default: Uses the chat template that is available in the</span></span>
+<span id="cb1-510"><a href="#cb1-510" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_config.json. If the chat template is not available in the tokenizer, it</span></span>
+<span id="cb1-511"><a href="#cb1-511" aria-hidden="true" tabindex="-1"></a><span class="co">  # will raise an error. This is the default.</span></span>
+<span id="cb1-512"><a href="#cb1-512" aria-hidden="true" tabindex="-1"></a><span class="co">  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates</span></span>
+<span id="cb1-513"><a href="#cb1-513" aria-hidden="true" tabindex="-1"></a><span class="co">  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.</span></span>
+<span id="cb1-514"><a href="#cb1-514" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback</span></span>
+<span id="cb1-515"><a href="#cb1-515" aria-hidden="true" tabindex="-1"></a><span class="co">  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.</span></span>
+<span id="cb1-516"><a href="#cb1-516" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat</span></span>
+<span id="cb1-517"><a href="#cb1-517" aria-hidden="true" tabindex="-1"></a><span class="co">  # template. The custom jinja template should be provided in the chat_template_jinja</span></span>
+<span id="cb1-518"><a href="#cb1-518" aria-hidden="true" tabindex="-1"></a><span class="co">  # field.</span></span>
+<span id="cb1-519"><a href="#cb1-519" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">chat_template</span><span class="kw">:</span><span class="at"> ChatTemplate | str | None</span></span>
+<span id="cb1-520"><a href="#cb1-520" aria-hidden="true" tabindex="-1"></a><span class="co">  # Custom jinja chat template or path to jinja file. Used only if `chat_template:</span></span>
+<span id="cb1-521"><a href="#cb1-521" aria-hidden="true" tabindex="-1"></a><span class="co">  # jinja` or empty.</span></span>
+<span id="cb1-522"><a href="#cb1-522" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">chat_template_jinja</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-523"><a href="#cb1-523" aria-hidden="true" tabindex="-1"></a><span class="co">  # path to source data files</span></span>
+<span id="cb1-524"><a href="#cb1-524" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> str | list[str] | None</span></span>
+<span id="cb1-525"><a href="#cb1-525" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">input_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-526"><a href="#cb1-526" aria-hidden="true" tabindex="-1"></a><span class="co">  # name of dataset configuration to load</span></span>
+<span id="cb1-527"><a href="#cb1-527" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-528"><a href="#cb1-528" aria-hidden="true" tabindex="-1"></a><span class="co">  # defines the datatype when path is a file</span></span>
+<span id="cb1-529"><a href="#cb1-529" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">ds_type</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-530"><a href="#cb1-530" aria-hidden="true" tabindex="-1"></a><span class="co">  # For `completion` datasets only, uses the provided field instead of `text` column</span></span>
+<span id="cb1-531"><a href="#cb1-531" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-532"><a href="#cb1-532" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_human</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-533"><a href="#cb1-533" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-534"><a href="#cb1-534" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the messages (default: "messages")</span></span>
+<span id="cb1-535"><a href="#cb1-535" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_messages</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-536"><a href="#cb1-536" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON</span></span>
+<span id="cb1-537"><a href="#cb1-537" aria-hidden="true" tabindex="-1"></a><span class="co">  # schema](https://json-schema.org/learn/getting-started-step-by-step).</span></span>
+<span id="cb1-538"><a href="#cb1-538" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_tools</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-539"><a href="#cb1-539" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the reasoning trace (default: "reasoning_content").</span></span>
+<span id="cb1-540"><a href="#cb1-540" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_thinking</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-541"><a href="#cb1-541" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key the chat template expects that indicates the reasoning trace.</span></span>
+<span id="cb1-542"><a href="#cb1-542" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">template_thinking_key</span><span class="kw">:</span><span class="at"> str | None</span></span>
 <span id="cb1-543"><a href="#cb1-543" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-544"><a href="#cb1-544" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_content</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-545"><a href="#cb1-545" aria-hidden="true" tabindex="-1"></a><span class="co">  # Mapping of properties from the input dataset to the chat template. (default:</span></span>
-<span id="cb1-546"><a href="#cb1-546" aria-hidden="true" tabindex="-1"></a><span class="co">  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists</span></span>
-<span id="cb1-547"><a href="#cb1-547" aria-hidden="true" tabindex="-1"></a><span class="co">  # in the template but not in this mapping, the system will attempt to load it directly</span></span>
-<span id="cb1-548"><a href="#cb1-548" aria-hidden="true" tabindex="-1"></a><span class="co">  # from the message using the property name as the key. Example: In the mapping below,</span></span>
-<span id="cb1-549"><a href="#cb1-549" aria-hidden="true" tabindex="-1"></a><span class="co">  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and</span></span>
-<span id="cb1-550"><a href="#cb1-550" aria-hidden="true" tabindex="-1"></a><span class="co">  # used as 'content' in the chat template.</span></span>
-<span id="cb1-551"><a href="#cb1-551" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_property_mappings</span><span class="kw">:</span><span class="at"> dict[str, str] | None</span></span>
-<span id="cb1-552"><a href="#cb1-552" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key in the message turn that indicates via boolean whether tokens of a turn</span></span>
-<span id="cb1-553"><a href="#cb1-553" aria-hidden="true" tabindex="-1"></a><span class="co">  # should be considered for training. Useful to selectively train on certain turns</span></span>
-<span id="cb1-554"><a href="#cb1-554" aria-hidden="true" tabindex="-1"></a><span class="co">  # besides the `roles_to_train`.</span></span>
-<span id="cb1-555"><a href="#cb1-555" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_training</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-556"><a href="#cb1-556" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key in the message turn that contains the training details. Useful to</span></span>
-<span id="cb1-557"><a href="#cb1-557" aria-hidden="true" tabindex="-1"></a><span class="co">  # selectively train on certain tokens in a turn. The value of the key is a List[Dict]</span></span>
-<span id="cb1-558"><a href="#cb1-558" aria-hidden="true" tabindex="-1"></a><span class="co">  # containing `begin_offset` (start character index in content), `end_offset` (end</span></span>
-<span id="cb1-559"><a href="#cb1-559" aria-hidden="true" tabindex="-1"></a><span class="co">  # character index in content), and `train` (boolean whether to train).</span></span>
-<span id="cb1-560"><a href="#cb1-560" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_training_detail</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-561"><a href="#cb1-561" aria-hidden="true" tabindex="-1"></a><span class="co">  # (for Qwen3 template only) Whether to split the assistant content based on a</span></span>
-<span id="cb1-562"><a href="#cb1-562" aria-hidden="true" tabindex="-1"></a><span class="co">  # reasoning trace inside delimited tags</span></span>
-<span id="cb1-563"><a href="#cb1-563" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split_thinking</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-564"><a href="#cb1-564" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">logprobs_field</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-565"><a href="#cb1-565" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">temperature</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-566"><a href="#cb1-566" aria-hidden="true" tabindex="-1"></a><span class="co">  # Roles to train on. The tokens from these roles will be considered for the loss.</span></span>
-<span id="cb1-567"><a href="#cb1-567" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">roles_to_train</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-568"><a href="#cb1-568" aria-hidden="true" tabindex="-1"></a><span class="co">  # Which EOS tokens to train on in the conversation. Possible values are: all: train on</span></span>
-<span id="cb1-569"><a href="#cb1-569" aria-hidden="true" tabindex="-1"></a><span class="co">  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable</span></span>
-<span id="cb1-570"><a href="#cb1-570" aria-hidden="true" tabindex="-1"></a><span class="co">  # turn, last: train on the last EOS token in the conversation</span></span>
-<span id="cb1-571"><a href="#cb1-571" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">train_on_eos</span><span class="kw">:</span><span class="at"> Literal['all', 'turn', 'last'] | None</span></span>
-<span id="cb1-572"><a href="#cb1-572" aria-hidden="true" tabindex="-1"></a><span class="co">  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All</span></span>
-<span id="cb1-573"><a href="#cb1-573" aria-hidden="true" tabindex="-1"></a><span class="co">  # source roles will be mapped to the target role. The default is: user: ["human",</span></span>
-<span id="cb1-574"><a href="#cb1-574" aria-hidden="true" tabindex="-1"></a><span class="co">  # "user"], assistant: ["gpt", "assistant"], system: ["system"], tool: ["tool"]</span></span>
-<span id="cb1-575"><a href="#cb1-575" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">roles</span><span class="kw">:</span><span class="at"> dict[str, list[str]] | None</span></span>
-<span id="cb1-576"><a href="#cb1-576" aria-hidden="true" tabindex="-1"></a><span class="co">  # Whether to drop the system turn from the dataset. Only works with chat_template.</span></span>
-<span id="cb1-577"><a href="#cb1-577" aria-hidden="true" tabindex="-1"></a><span class="co">  # This does not drop the default system message from chat_template if it exists. If</span></span>
-<span id="cb1-578"><a href="#cb1-578" aria-hidden="true" tabindex="-1"></a><span class="co">  # you wish to, we recommend using a custom jinja template with the default system</span></span>
-<span id="cb1-579"><a href="#cb1-579" aria-hidden="true" tabindex="-1"></a><span class="co">  # message removed or adding a system turn with empty content.</span></span>
-<span id="cb1-580"><a href="#cb1-580" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">drop_system_message</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-581"><a href="#cb1-581" aria-hidden="true" tabindex="-1"></a><span class="co">  # Trust remote code for untrusted source</span></span>
-<span id="cb1-582"><a href="#cb1-582" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">trust_remote_code</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-583"><a href="#cb1-583" aria-hidden="true" tabindex="-1"></a><span class="co">  # The specific revision of the dataset to use when loading from the Hugging Face Hub.</span></span>
-<span id="cb1-584"><a href="#cb1-584" aria-hidden="true" tabindex="-1"></a><span class="co">  # This can be a commit hash, tag, or branch name. If not specified, the latest version</span></span>
-<span id="cb1-585"><a href="#cb1-585" aria-hidden="true" tabindex="-1"></a><span class="co">  # will be used. This parameter is ignored for local datasets.</span></span>
-<span id="cb1-586"><a href="#cb1-586" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-587"><a href="#cb1-587" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-588"><a href="#cb1-588" aria-hidden="true" tabindex="-1"></a><span class="co">  # For DPODataset:</span></span>
-<span id="cb1-589"><a href="#cb1-589" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-590"><a href="#cb1-590" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-591"><a href="#cb1-591" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> UserDefinedDPOType | str | None</span></span>
-<span id="cb1-592"><a href="#cb1-592" aria-hidden="true" tabindex="-1"></a><span class="co">    # For UserDefinedDPOType:</span></span>
-<span id="cb1-593"><a href="#cb1-593" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_system</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-594"><a href="#cb1-594" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_prompt</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-595"><a href="#cb1-595" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_chosen</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-596"><a href="#cb1-596" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_rejected</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-597"><a href="#cb1-597" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">prompt_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-598"><a href="#cb1-598" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">chosen_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-599"><a href="#cb1-599" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">rejected_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-600"><a href="#cb1-600" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-601"><a href="#cb1-601" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-602"><a href="#cb1-602" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_messages</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-603"><a href="#cb1-603" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-604"><a href="#cb1-604" aria-hidden="true" tabindex="-1"></a><span class="co">  # For KTODataset:</span></span>
-<span id="cb1-605"><a href="#cb1-605" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-606"><a href="#cb1-606" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-607"><a href="#cb1-607" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> UserDefinedKTOType | str | None</span></span>
-<span id="cb1-608"><a href="#cb1-608" aria-hidden="true" tabindex="-1"></a><span class="co">    # For UserDefinedKTOType:</span></span>
-<span id="cb1-609"><a href="#cb1-609" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_system</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-610"><a href="#cb1-610" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_prompt</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-611"><a href="#cb1-611" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_completion</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-612"><a href="#cb1-612" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_label</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-613"><a href="#cb1-613" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">prompt_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-614"><a href="#cb1-614" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">completion_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-615"><a href="#cb1-615" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-616"><a href="#cb1-616" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">trust_remote_code</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-617"><a href="#cb1-617" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-618"><a href="#cb1-618" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-619"><a href="#cb1-619" aria-hidden="true" tabindex="-1"></a><span class="co">  # For StepwiseSupervisedDataset:</span></span>
-<span id="cb1-620"><a href="#cb1-620" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-621"><a href="#cb1-621" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-622"><a href="#cb1-622" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-623"><a href="#cb1-623" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-624"><a href="#cb1-624" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">step_separator</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-625"><a href="#cb1-625" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">max_completion_length</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-626"><a href="#cb1-626" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">train_on_last_step_only</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-627"><a href="#cb1-627" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-628"><a href="#cb1-628" aria-hidden="true" tabindex="-1"></a><span class="co">  # For SyntheticDataset:</span></span>
-<span id="cb1-629"><a href="#cb1-629" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> Literal['synthetic'] = synthetic</span></span>
-<span id="cb1-630"><a href="#cb1-630" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> Literal['_synthetic'] = _synthetic</span></span>
-<span id="cb1-631"><a href="#cb1-631" aria-hidden="true" tabindex="-1"></a><span class="co">  # Number of rows to generate</span></span>
-<span id="cb1-632"><a href="#cb1-632" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">length</span><span class="kw">:</span><span class="at"> int = 1000</span></span>
-<span id="cb1-633"><a href="#cb1-633" aria-hidden="true" tabindex="-1"></a><span class="co">  # Sequence length per row (defaults to sequence_len from config)</span></span>
-<span id="cb1-634"><a href="#cb1-634" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">sequence_length</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-635"><a href="#cb1-635" aria-hidden="true" tabindex="-1"></a><span class="co">  # Minimum token ID for generation</span></span>
-<span id="cb1-636"><a href="#cb1-636" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">min_input_id</span><span class="kw">:</span><span class="at"> int = 100</span></span>
-<span id="cb1-637"><a href="#cb1-637" aria-hidden="true" tabindex="-1"></a><span class="co">  # Maximum token ID for generation (defaults to tokenizer vocab_size)</span></span>
-<span id="cb1-638"><a href="#cb1-638" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">max_input_id</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-639"><a href="#cb1-639" aria-hidden="true" tabindex="-1"></a><span class="co">  # Random seed for reproducibility</span></span>
-<span id="cb1-640"><a href="#cb1-640" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">seed</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-641"><a href="#cb1-641" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-642"><a href="#cb1-642" aria-hidden="true" tabindex="-1"></a><span class="co"># If false, the datasets will not be shuffled and will keep their original order in</span></span>
-<span id="cb1-643"><a href="#cb1-643" aria-hidden="true" tabindex="-1"></a><span class="co"># `datasets`. The same applies to the `test_datasets` option and the</span></span>
-<span id="cb1-644"><a href="#cb1-644" aria-hidden="true" tabindex="-1"></a><span class="co"># `pretraining_dataset` option. Default is true.</span></span>
-<span id="cb1-645"><a href="#cb1-645" aria-hidden="true" tabindex="-1"></a><span class="fu">shuffle_merged_datasets</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
-<span id="cb1-646"><a href="#cb1-646" aria-hidden="true" tabindex="-1"></a><span class="co"># If true, each dataset in `datasets` will be shuffled before merging. This allows</span></span>
-<span id="cb1-647"><a href="#cb1-647" aria-hidden="true" tabindex="-1"></a><span class="co"># curriculum learning strategies to be applied at the dataset level. Default is false.</span></span>
-<span id="cb1-648"><a href="#cb1-648" aria-hidden="true" tabindex="-1"></a><span class="fu">shuffle_before_merging_datasets</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-649"><a href="#cb1-649" aria-hidden="true" tabindex="-1"></a><span class="co"># Axolotl attempts to save the dataset as an arrow after packing the data together so</span></span>
-<span id="cb1-650"><a href="#cb1-650" aria-hidden="true" tabindex="-1"></a><span class="co"># subsequent training attempts load faster, relative path</span></span>
-<span id="cb1-651"><a href="#cb1-651" aria-hidden="true" tabindex="-1"></a><span class="fu">dataset_prepared_path</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-652"><a href="#cb1-652" aria-hidden="true" tabindex="-1"></a><span class="co"># Num shards for whole dataset</span></span>
-<span id="cb1-653"><a href="#cb1-653" aria-hidden="true" tabindex="-1"></a><span class="fu">dataset_shard_num</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-654"><a href="#cb1-654" aria-hidden="true" tabindex="-1"></a><span class="co"># Index of shard to use for whole dataset</span></span>
-<span id="cb1-655"><a href="#cb1-655" aria-hidden="true" tabindex="-1"></a><span class="fu">dataset_shard_idx</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-656"><a href="#cb1-656" aria-hidden="true" tabindex="-1"></a><span class="fu">skip_prepare_dataset</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-657"><a href="#cb1-657" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of shards to save the prepared dataset</span></span>
-<span id="cb1-658"><a href="#cb1-658" aria-hidden="true" tabindex="-1"></a><span class="fu">num_dataset_shards_to_save</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-659"><a href="#cb1-659" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-660"><a href="#cb1-660" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize</span></span>
-<span id="cb1-661"><a href="#cb1-661" aria-hidden="true" tabindex="-1"></a><span class="fu">pretraining_dataset</span><span class="kw">:</span><span class="at"> Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None</span></span>
-<span id="cb1-662"><a href="#cb1-662" aria-hidden="true" tabindex="-1"></a><span class="co">  # For PretrainingDataset:</span></span>
-<span id="cb1-663"><a href="#cb1-663" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-664"><a href="#cb1-664" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-665"><a href="#cb1-665" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None = train</span></span>
-<span id="cb1-666"><a href="#cb1-666" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">text_column</span><span class="kw">:</span><span class="at"> str | None = text</span></span>
-<span id="cb1-667"><a href="#cb1-667" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> str | None = pretrain</span></span>
-<span id="cb1-668"><a href="#cb1-668" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">trust_remote_code</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-669"><a href="#cb1-669" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-670"><a href="#cb1-670" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">skip</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-671"><a href="#cb1-671" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-672"><a href="#cb1-672" aria-hidden="true" tabindex="-1"></a><span class="co">  # For SFTDataset:</span></span>
-<span id="cb1-673"><a href="#cb1-673" aria-hidden="true" tabindex="-1"></a><span class="co">  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory</span></span>
-<span id="cb1-674"><a href="#cb1-674" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-675"><a href="#cb1-675" aria-hidden="true" tabindex="-1"></a><span class="co">  # name of dataset split to load from</span></span>
-<span id="cb1-676"><a href="#cb1-676" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-677"><a href="#cb1-677" aria-hidden="true" tabindex="-1"></a><span class="co">  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]</span></span>
-<span id="cb1-678"><a href="#cb1-678" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> str | UserDefinedPrompterType | None</span></span>
-<span id="cb1-679"><a href="#cb1-679" aria-hidden="true" tabindex="-1"></a><span class="co">    # For UserDefinedPrompterType:</span></span>
-<span id="cb1-680"><a href="#cb1-680" aria-hidden="true" tabindex="-1"></a><span class="co">    # Custom user instruction prompt</span></span>
-<span id="cb1-681"><a href="#cb1-681" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">system_prompt</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-682"><a href="#cb1-682" aria-hidden="true" tabindex="-1"></a><span class="co">    # Use {system} as key to be replaced</span></span>
-<span id="cb1-683"><a href="#cb1-683" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">system_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-684"><a href="#cb1-684" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_system</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-685"><a href="#cb1-685" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_instruction</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-686"><a href="#cb1-686" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_input</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-687"><a href="#cb1-687" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_output</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-688"><a href="#cb1-688" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-689"><a href="#cb1-689" aria-hidden="true" tabindex="-1"></a><span class="co">    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to</span></span>
-<span id="cb1-690"><a href="#cb1-690" aria-hidden="true" tabindex="-1"></a><span class="co">    # be replaced. 'format' can include {input}</span></span>
-<span id="cb1-691"><a href="#cb1-691" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-692"><a href="#cb1-692" aria-hidden="true" tabindex="-1"></a><span class="co">    # 'no_input_format' cannot include {input}</span></span>
-<span id="cb1-693"><a href="#cb1-693" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">no_input_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-694"><a href="#cb1-694" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">input_transform</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-695"><a href="#cb1-695" aria-hidden="true" tabindex="-1"></a><span class="co">  # split dataset into N pieces (use with shards_idx)</span></span>
-<span id="cb1-696"><a href="#cb1-696" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">shards</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-697"><a href="#cb1-697" aria-hidden="true" tabindex="-1"></a><span class="co">  # the index of sharded dataset to use</span></span>
-<span id="cb1-698"><a href="#cb1-698" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">shards_idx</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-699"><a href="#cb1-699" aria-hidden="true" tabindex="-1"></a><span class="co">  # process dataset in N sequential chunks for memory efficiency (exclusive with</span></span>
-<span id="cb1-700"><a href="#cb1-700" aria-hidden="true" tabindex="-1"></a><span class="co">  # `shards`)</span></span>
-<span id="cb1-701"><a href="#cb1-701" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">preprocess_shards</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-702"><a href="#cb1-702" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">conversation</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-703"><a href="#cb1-703" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-704"><a href="#cb1-704" aria-hidden="true" tabindex="-1"></a><span class="co">  # The name of the chat template to use for training, following values are supported:</span></span>
-<span id="cb1-705"><a href="#cb1-705" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default: Uses the chat template that is available in the</span></span>
-<span id="cb1-706"><a href="#cb1-706" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_config.json. If the chat template is not available in the tokenizer, it</span></span>
-<span id="cb1-707"><a href="#cb1-707" aria-hidden="true" tabindex="-1"></a><span class="co">  # will raise an error. This is the default.</span></span>
-<span id="cb1-708"><a href="#cb1-708" aria-hidden="true" tabindex="-1"></a><span class="co">  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates</span></span>
-<span id="cb1-709"><a href="#cb1-709" aria-hidden="true" tabindex="-1"></a><span class="co">  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.</span></span>
-<span id="cb1-710"><a href="#cb1-710" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback</span></span>
-<span id="cb1-711"><a href="#cb1-711" aria-hidden="true" tabindex="-1"></a><span class="co">  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.</span></span>
-<span id="cb1-712"><a href="#cb1-712" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat</span></span>
-<span id="cb1-713"><a href="#cb1-713" aria-hidden="true" tabindex="-1"></a><span class="co">  # template. The custom jinja template should be provided in the chat_template_jinja</span></span>
-<span id="cb1-714"><a href="#cb1-714" aria-hidden="true" tabindex="-1"></a><span class="co">  # field.</span></span>
-<span id="cb1-715"><a href="#cb1-715" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">chat_template</span><span class="kw">:</span><span class="at"> ChatTemplate | str | None</span></span>
-<span id="cb1-716"><a href="#cb1-716" aria-hidden="true" tabindex="-1"></a><span class="co">  # Custom jinja chat template or path to jinja file. Used only if `chat_template:</span></span>
-<span id="cb1-717"><a href="#cb1-717" aria-hidden="true" tabindex="-1"></a><span class="co">  # jinja` or empty.</span></span>
-<span id="cb1-718"><a href="#cb1-718" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">chat_template_jinja</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-719"><a href="#cb1-719" aria-hidden="true" tabindex="-1"></a><span class="co">  # path to source data files</span></span>
-<span id="cb1-720"><a href="#cb1-720" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> str | list[str] | None</span></span>
-<span id="cb1-721"><a href="#cb1-721" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">input_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-722"><a href="#cb1-722" aria-hidden="true" tabindex="-1"></a><span class="co">  # name of dataset configuration to load</span></span>
-<span id="cb1-723"><a href="#cb1-723" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-724"><a href="#cb1-724" aria-hidden="true" tabindex="-1"></a><span class="co">  # defines the datatype when path is a file</span></span>
-<span id="cb1-725"><a href="#cb1-725" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">ds_type</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-726"><a href="#cb1-726" aria-hidden="true" tabindex="-1"></a><span class="co">  # For `completion` datasets only, uses the provided field instead of `text` column</span></span>
-<span id="cb1-727"><a href="#cb1-727" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-728"><a href="#cb1-728" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_human</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-729"><a href="#cb1-729" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-730"><a href="#cb1-730" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the messages (default: "messages")</span></span>
-<span id="cb1-731"><a href="#cb1-731" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_messages</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-732"><a href="#cb1-732" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON</span></span>
-<span id="cb1-733"><a href="#cb1-733" aria-hidden="true" tabindex="-1"></a><span class="co">  # schema](https://json-schema.org/learn/getting-started-step-by-step).</span></span>
-<span id="cb1-734"><a href="#cb1-734" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_tools</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-735"><a href="#cb1-735" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the reasoning trace (default: "reasoning_content").</span></span>
-<span id="cb1-736"><a href="#cb1-736" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_thinking</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-737"><a href="#cb1-737" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key the chat template expects that indicates the reasoning trace.</span></span>
-<span id="cb1-738"><a href="#cb1-738" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">template_thinking_key</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-739"><a href="#cb1-739" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-740"><a href="#cb1-740" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_role</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-544"><a href="#cb1-544" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_role</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-545"><a href="#cb1-545" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-546"><a href="#cb1-546" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_content</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-547"><a href="#cb1-547" aria-hidden="true" tabindex="-1"></a><span class="co">  # Mapping of properties from the input dataset to the chat template. (default:</span></span>
+<span id="cb1-548"><a href="#cb1-548" aria-hidden="true" tabindex="-1"></a><span class="co">  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists</span></span>
+<span id="cb1-549"><a href="#cb1-549" aria-hidden="true" tabindex="-1"></a><span class="co">  # in the template but not in this mapping, the system will attempt to load it directly</span></span>
+<span id="cb1-550"><a href="#cb1-550" aria-hidden="true" tabindex="-1"></a><span class="co">  # from the message using the property name as the key. Example: In the mapping below,</span></span>
+<span id="cb1-551"><a href="#cb1-551" aria-hidden="true" tabindex="-1"></a><span class="co">  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and</span></span>
+<span id="cb1-552"><a href="#cb1-552" aria-hidden="true" tabindex="-1"></a><span class="co">  # used as 'content' in the chat template.</span></span>
+<span id="cb1-553"><a href="#cb1-553" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_property_mappings</span><span class="kw">:</span><span class="at"> dict[str, str] | None</span></span>
+<span id="cb1-554"><a href="#cb1-554" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key in the message turn that indicates via boolean whether tokens of a turn</span></span>
+<span id="cb1-555"><a href="#cb1-555" aria-hidden="true" tabindex="-1"></a><span class="co">  # should be considered for training. Useful to selectively train on certain turns</span></span>
+<span id="cb1-556"><a href="#cb1-556" aria-hidden="true" tabindex="-1"></a><span class="co">  # besides the `roles_to_train`.</span></span>
+<span id="cb1-557"><a href="#cb1-557" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_training</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-558"><a href="#cb1-558" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key in the message turn that contains the training details. Useful to</span></span>
+<span id="cb1-559"><a href="#cb1-559" aria-hidden="true" tabindex="-1"></a><span class="co">  # selectively train on certain tokens in a turn. The value of the key is a List[Dict]</span></span>
+<span id="cb1-560"><a href="#cb1-560" aria-hidden="true" tabindex="-1"></a><span class="co">  # containing `begin_offset` (start character index in content), `end_offset` (end</span></span>
+<span id="cb1-561"><a href="#cb1-561" aria-hidden="true" tabindex="-1"></a><span class="co">  # character index in content), and `train` (boolean whether to train).</span></span>
+<span id="cb1-562"><a href="#cb1-562" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_training_detail</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-563"><a href="#cb1-563" aria-hidden="true" tabindex="-1"></a><span class="co">  # (for Qwen3 template only) Whether to split the assistant content based on a</span></span>
+<span id="cb1-564"><a href="#cb1-564" aria-hidden="true" tabindex="-1"></a><span class="co">  # reasoning trace inside delimited tags</span></span>
+<span id="cb1-565"><a href="#cb1-565" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split_thinking</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-566"><a href="#cb1-566" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">logprobs_field</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-567"><a href="#cb1-567" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">temperature</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-568"><a href="#cb1-568" aria-hidden="true" tabindex="-1"></a><span class="co">  # Roles to train on. The tokens from these roles will be considered for the loss.</span></span>
+<span id="cb1-569"><a href="#cb1-569" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">roles_to_train</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-570"><a href="#cb1-570" aria-hidden="true" tabindex="-1"></a><span class="co">  # Which EOS tokens to train on in the conversation. Possible values are: all: train on</span></span>
+<span id="cb1-571"><a href="#cb1-571" aria-hidden="true" tabindex="-1"></a><span class="co">  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable</span></span>
+<span id="cb1-572"><a href="#cb1-572" aria-hidden="true" tabindex="-1"></a><span class="co">  # turn, last: train on the last EOS token in the conversation</span></span>
+<span id="cb1-573"><a href="#cb1-573" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">train_on_eos</span><span class="kw">:</span><span class="at"> Literal['all', 'turn', 'last'] | None</span></span>
+<span id="cb1-574"><a href="#cb1-574" aria-hidden="true" tabindex="-1"></a><span class="co">  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All</span></span>
+<span id="cb1-575"><a href="#cb1-575" aria-hidden="true" tabindex="-1"></a><span class="co">  # source roles will be mapped to the target role. The default is: user: ["human",</span></span>
+<span id="cb1-576"><a href="#cb1-576" aria-hidden="true" tabindex="-1"></a><span class="co">  # "user"], assistant: ["gpt", "assistant"], system: ["system"], tool: ["tool"]</span></span>
+<span id="cb1-577"><a href="#cb1-577" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">roles</span><span class="kw">:</span><span class="at"> dict[str, list[str]] | None</span></span>
+<span id="cb1-578"><a href="#cb1-578" aria-hidden="true" tabindex="-1"></a><span class="co">  # Whether to drop the system turn from the dataset. Only works with chat_template.</span></span>
+<span id="cb1-579"><a href="#cb1-579" aria-hidden="true" tabindex="-1"></a><span class="co">  # This does not drop the default system message from chat_template if it exists. If</span></span>
+<span id="cb1-580"><a href="#cb1-580" aria-hidden="true" tabindex="-1"></a><span class="co">  # you wish to, we recommend using a custom jinja template with the default system</span></span>
+<span id="cb1-581"><a href="#cb1-581" aria-hidden="true" tabindex="-1"></a><span class="co">  # message removed or adding a system turn with empty content.</span></span>
+<span id="cb1-582"><a href="#cb1-582" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">drop_system_message</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-583"><a href="#cb1-583" aria-hidden="true" tabindex="-1"></a><span class="co">  # Trust remote code for untrusted source</span></span>
+<span id="cb1-584"><a href="#cb1-584" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">trust_remote_code</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-585"><a href="#cb1-585" aria-hidden="true" tabindex="-1"></a><span class="co">  # The specific revision of the dataset to use when loading from the Hugging Face Hub.</span></span>
+<span id="cb1-586"><a href="#cb1-586" aria-hidden="true" tabindex="-1"></a><span class="co">  # This can be a commit hash, tag, or branch name. If not specified, the latest version</span></span>
+<span id="cb1-587"><a href="#cb1-587" aria-hidden="true" tabindex="-1"></a><span class="co">  # will be used. This parameter is ignored for local datasets.</span></span>
+<span id="cb1-588"><a href="#cb1-588" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-589"><a href="#cb1-589" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-590"><a href="#cb1-590" aria-hidden="true" tabindex="-1"></a><span class="co">  # For DPODataset:</span></span>
+<span id="cb1-591"><a href="#cb1-591" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-592"><a href="#cb1-592" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-593"><a href="#cb1-593" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> UserDefinedDPOType | str | None</span></span>
+<span id="cb1-594"><a href="#cb1-594" aria-hidden="true" tabindex="-1"></a><span class="co">    # For UserDefinedDPOType:</span></span>
+<span id="cb1-595"><a href="#cb1-595" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_system</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-596"><a href="#cb1-596" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_prompt</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-597"><a href="#cb1-597" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_chosen</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-598"><a href="#cb1-598" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_rejected</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-599"><a href="#cb1-599" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">prompt_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-600"><a href="#cb1-600" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">chosen_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-601"><a href="#cb1-601" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">rejected_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-602"><a href="#cb1-602" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-603"><a href="#cb1-603" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-604"><a href="#cb1-604" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_messages</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-605"><a href="#cb1-605" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-606"><a href="#cb1-606" aria-hidden="true" tabindex="-1"></a><span class="co">  # For KTODataset:</span></span>
+<span id="cb1-607"><a href="#cb1-607" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-608"><a href="#cb1-608" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-609"><a href="#cb1-609" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> UserDefinedKTOType | str | None</span></span>
+<span id="cb1-610"><a href="#cb1-610" aria-hidden="true" tabindex="-1"></a><span class="co">    # For UserDefinedKTOType:</span></span>
+<span id="cb1-611"><a href="#cb1-611" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_system</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-612"><a href="#cb1-612" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_prompt</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-613"><a href="#cb1-613" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_completion</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-614"><a href="#cb1-614" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_label</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-615"><a href="#cb1-615" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">prompt_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-616"><a href="#cb1-616" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">completion_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-617"><a href="#cb1-617" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-618"><a href="#cb1-618" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">trust_remote_code</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-619"><a href="#cb1-619" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-620"><a href="#cb1-620" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-621"><a href="#cb1-621" aria-hidden="true" tabindex="-1"></a><span class="co">  # For StepwiseSupervisedDataset:</span></span>
+<span id="cb1-622"><a href="#cb1-622" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-623"><a href="#cb1-623" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-624"><a href="#cb1-624" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-625"><a href="#cb1-625" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-626"><a href="#cb1-626" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">step_separator</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-627"><a href="#cb1-627" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">max_completion_length</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-628"><a href="#cb1-628" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">train_on_last_step_only</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-629"><a href="#cb1-629" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-630"><a href="#cb1-630" aria-hidden="true" tabindex="-1"></a><span class="co">  # For SyntheticDataset:</span></span>
+<span id="cb1-631"><a href="#cb1-631" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> Literal['synthetic'] = synthetic</span></span>
+<span id="cb1-632"><a href="#cb1-632" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> Literal['_synthetic'] = _synthetic</span></span>
+<span id="cb1-633"><a href="#cb1-633" aria-hidden="true" tabindex="-1"></a><span class="co">  # Number of rows to generate</span></span>
+<span id="cb1-634"><a href="#cb1-634" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">length</span><span class="kw">:</span><span class="at"> int = 1000</span></span>
+<span id="cb1-635"><a href="#cb1-635" aria-hidden="true" tabindex="-1"></a><span class="co">  # Sequence length per row (defaults to sequence_len from config)</span></span>
+<span id="cb1-636"><a href="#cb1-636" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">sequence_length</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-637"><a href="#cb1-637" aria-hidden="true" tabindex="-1"></a><span class="co">  # Minimum token ID for generation</span></span>
+<span id="cb1-638"><a href="#cb1-638" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">min_input_id</span><span class="kw">:</span><span class="at"> int = 100</span></span>
+<span id="cb1-639"><a href="#cb1-639" aria-hidden="true" tabindex="-1"></a><span class="co">  # Maximum token ID for generation (defaults to tokenizer vocab_size)</span></span>
+<span id="cb1-640"><a href="#cb1-640" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">max_input_id</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-641"><a href="#cb1-641" aria-hidden="true" tabindex="-1"></a><span class="co">  # Random seed for reproducibility</span></span>
+<span id="cb1-642"><a href="#cb1-642" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">seed</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-643"><a href="#cb1-643" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-644"><a href="#cb1-644" aria-hidden="true" tabindex="-1"></a><span class="co"># If false, the datasets will not be shuffled and will keep their original order in</span></span>
+<span id="cb1-645"><a href="#cb1-645" aria-hidden="true" tabindex="-1"></a><span class="co"># `datasets`. The same applies to the `test_datasets` option and the</span></span>
+<span id="cb1-646"><a href="#cb1-646" aria-hidden="true" tabindex="-1"></a><span class="co"># `pretraining_dataset` option. Default is true.</span></span>
+<span id="cb1-647"><a href="#cb1-647" aria-hidden="true" tabindex="-1"></a><span class="fu">shuffle_merged_datasets</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
+<span id="cb1-648"><a href="#cb1-648" aria-hidden="true" tabindex="-1"></a><span class="co"># If true, each dataset in `datasets` will be shuffled before merging. This allows</span></span>
+<span id="cb1-649"><a href="#cb1-649" aria-hidden="true" tabindex="-1"></a><span class="co"># curriculum learning strategies to be applied at the dataset level. Default is false.</span></span>
+<span id="cb1-650"><a href="#cb1-650" aria-hidden="true" tabindex="-1"></a><span class="fu">shuffle_before_merging_datasets</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-651"><a href="#cb1-651" aria-hidden="true" tabindex="-1"></a><span class="co"># Axolotl attempts to save the dataset as an arrow after packing the data together so</span></span>
+<span id="cb1-652"><a href="#cb1-652" aria-hidden="true" tabindex="-1"></a><span class="co"># subsequent training attempts load faster, relative path</span></span>
+<span id="cb1-653"><a href="#cb1-653" aria-hidden="true" tabindex="-1"></a><span class="fu">dataset_prepared_path</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-654"><a href="#cb1-654" aria-hidden="true" tabindex="-1"></a><span class="co"># Num shards for whole dataset</span></span>
+<span id="cb1-655"><a href="#cb1-655" aria-hidden="true" tabindex="-1"></a><span class="fu">dataset_shard_num</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-656"><a href="#cb1-656" aria-hidden="true" tabindex="-1"></a><span class="co"># Index of shard to use for whole dataset</span></span>
+<span id="cb1-657"><a href="#cb1-657" aria-hidden="true" tabindex="-1"></a><span class="fu">dataset_shard_idx</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-658"><a href="#cb1-658" aria-hidden="true" tabindex="-1"></a><span class="fu">skip_prepare_dataset</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-659"><a href="#cb1-659" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of shards to save the prepared dataset</span></span>
+<span id="cb1-660"><a href="#cb1-660" aria-hidden="true" tabindex="-1"></a><span class="fu">num_dataset_shards_to_save</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-661"><a href="#cb1-661" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-662"><a href="#cb1-662" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize</span></span>
+<span id="cb1-663"><a href="#cb1-663" aria-hidden="true" tabindex="-1"></a><span class="fu">pretraining_dataset</span><span class="kw">:</span><span class="at"> Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None</span></span>
+<span id="cb1-664"><a href="#cb1-664" aria-hidden="true" tabindex="-1"></a><span class="co">  # For PretrainingDataset:</span></span>
+<span id="cb1-665"><a href="#cb1-665" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-666"><a href="#cb1-666" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-667"><a href="#cb1-667" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None = train</span></span>
+<span id="cb1-668"><a href="#cb1-668" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">text_column</span><span class="kw">:</span><span class="at"> str | None = text</span></span>
+<span id="cb1-669"><a href="#cb1-669" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> str | None = pretrain</span></span>
+<span id="cb1-670"><a href="#cb1-670" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">trust_remote_code</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-671"><a href="#cb1-671" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-672"><a href="#cb1-672" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">skip</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-673"><a href="#cb1-673" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-674"><a href="#cb1-674" aria-hidden="true" tabindex="-1"></a><span class="co">  # For SFTDataset:</span></span>
+<span id="cb1-675"><a href="#cb1-675" aria-hidden="true" tabindex="-1"></a><span class="co">  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory</span></span>
+<span id="cb1-676"><a href="#cb1-676" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">path</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-677"><a href="#cb1-677" aria-hidden="true" tabindex="-1"></a><span class="co">  # name of dataset split to load from</span></span>
+<span id="cb1-678"><a href="#cb1-678" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-679"><a href="#cb1-679" aria-hidden="true" tabindex="-1"></a><span class="co">  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]</span></span>
+<span id="cb1-680"><a href="#cb1-680" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">type</span><span class="kw">:</span><span class="at"> str | UserDefinedPrompterType | None</span></span>
+<span id="cb1-681"><a href="#cb1-681" aria-hidden="true" tabindex="-1"></a><span class="co">    # For UserDefinedPrompterType:</span></span>
+<span id="cb1-682"><a href="#cb1-682" aria-hidden="true" tabindex="-1"></a><span class="co">    # Custom user instruction prompt</span></span>
+<span id="cb1-683"><a href="#cb1-683" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">system_prompt</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-684"><a href="#cb1-684" aria-hidden="true" tabindex="-1"></a><span class="co">    # Use {system} as key to be replaced</span></span>
+<span id="cb1-685"><a href="#cb1-685" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">system_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-686"><a href="#cb1-686" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_system</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-687"><a href="#cb1-687" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_instruction</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-688"><a href="#cb1-688" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_input</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-689"><a href="#cb1-689" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">field_output</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-690"><a href="#cb1-690" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-691"><a href="#cb1-691" aria-hidden="true" tabindex="-1"></a><span class="co">    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to</span></span>
+<span id="cb1-692"><a href="#cb1-692" aria-hidden="true" tabindex="-1"></a><span class="co">    # be replaced. 'format' can include {input}</span></span>
+<span id="cb1-693"><a href="#cb1-693" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-694"><a href="#cb1-694" aria-hidden="true" tabindex="-1"></a><span class="co">    # 'no_input_format' cannot include {input}</span></span>
+<span id="cb1-695"><a href="#cb1-695" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">no_input_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-696"><a href="#cb1-696" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">input_transform</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-697"><a href="#cb1-697" aria-hidden="true" tabindex="-1"></a><span class="co">  # split dataset into N pieces (use with shards_idx)</span></span>
+<span id="cb1-698"><a href="#cb1-698" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">shards</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-699"><a href="#cb1-699" aria-hidden="true" tabindex="-1"></a><span class="co">  # the index of sharded dataset to use</span></span>
+<span id="cb1-700"><a href="#cb1-700" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">shards_idx</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-701"><a href="#cb1-701" aria-hidden="true" tabindex="-1"></a><span class="co">  # process dataset in N sequential chunks for memory efficiency (exclusive with</span></span>
+<span id="cb1-702"><a href="#cb1-702" aria-hidden="true" tabindex="-1"></a><span class="co">  # `shards`)</span></span>
+<span id="cb1-703"><a href="#cb1-703" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">preprocess_shards</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-704"><a href="#cb1-704" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">conversation</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-705"><a href="#cb1-705" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-706"><a href="#cb1-706" aria-hidden="true" tabindex="-1"></a><span class="co">  # The name of the chat template to use for training, following values are supported:</span></span>
+<span id="cb1-707"><a href="#cb1-707" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default: Uses the chat template that is available in the</span></span>
+<span id="cb1-708"><a href="#cb1-708" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_config.json. If the chat template is not available in the tokenizer, it</span></span>
+<span id="cb1-709"><a href="#cb1-709" aria-hidden="true" tabindex="-1"></a><span class="co">  # will raise an error. This is the default.</span></span>
+<span id="cb1-710"><a href="#cb1-710" aria-hidden="true" tabindex="-1"></a><span class="co">  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates</span></span>
+<span id="cb1-711"><a href="#cb1-711" aria-hidden="true" tabindex="-1"></a><span class="co">  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.</span></span>
+<span id="cb1-712"><a href="#cb1-712" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback</span></span>
+<span id="cb1-713"><a href="#cb1-713" aria-hidden="true" tabindex="-1"></a><span class="co">  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.</span></span>
+<span id="cb1-714"><a href="#cb1-714" aria-hidden="true" tabindex="-1"></a><span class="co">  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat</span></span>
+<span id="cb1-715"><a href="#cb1-715" aria-hidden="true" tabindex="-1"></a><span class="co">  # template. The custom jinja template should be provided in the chat_template_jinja</span></span>
+<span id="cb1-716"><a href="#cb1-716" aria-hidden="true" tabindex="-1"></a><span class="co">  # field.</span></span>
+<span id="cb1-717"><a href="#cb1-717" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">chat_template</span><span class="kw">:</span><span class="at"> ChatTemplate | str | None</span></span>
+<span id="cb1-718"><a href="#cb1-718" aria-hidden="true" tabindex="-1"></a><span class="co">  # Custom jinja chat template or path to jinja file. Used only if `chat_template:</span></span>
+<span id="cb1-719"><a href="#cb1-719" aria-hidden="true" tabindex="-1"></a><span class="co">  # jinja` or empty.</span></span>
+<span id="cb1-720"><a href="#cb1-720" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">chat_template_jinja</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-721"><a href="#cb1-721" aria-hidden="true" tabindex="-1"></a><span class="co">  # path to source data files</span></span>
+<span id="cb1-722"><a href="#cb1-722" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> str | list[str] | None</span></span>
+<span id="cb1-723"><a href="#cb1-723" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">input_format</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-724"><a href="#cb1-724" aria-hidden="true" tabindex="-1"></a><span class="co">  # name of dataset configuration to load</span></span>
+<span id="cb1-725"><a href="#cb1-725" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-726"><a href="#cb1-726" aria-hidden="true" tabindex="-1"></a><span class="co">  # defines the datatype when path is a file</span></span>
+<span id="cb1-727"><a href="#cb1-727" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">ds_type</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-728"><a href="#cb1-728" aria-hidden="true" tabindex="-1"></a><span class="co">  # For `completion` datasets only, uses the provided field instead of `text` column</span></span>
+<span id="cb1-729"><a href="#cb1-729" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-730"><a href="#cb1-730" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_human</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-731"><a href="#cb1-731" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-732"><a href="#cb1-732" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the messages (default: "messages")</span></span>
+<span id="cb1-733"><a href="#cb1-733" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_messages</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-734"><a href="#cb1-734" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON</span></span>
+<span id="cb1-735"><a href="#cb1-735" aria-hidden="true" tabindex="-1"></a><span class="co">  # schema](https://json-schema.org/learn/getting-started-step-by-step).</span></span>
+<span id="cb1-736"><a href="#cb1-736" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_tools</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-737"><a href="#cb1-737" aria-hidden="true" tabindex="-1"></a><span class="co">  # Key containing the reasoning trace (default: "reasoning_content").</span></span>
+<span id="cb1-738"><a href="#cb1-738" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">field_thinking</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-739"><a href="#cb1-739" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key the chat template expects that indicates the reasoning trace.</span></span>
+<span id="cb1-740"><a href="#cb1-740" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">template_thinking_key</span><span class="kw">:</span><span class="at"> str | None</span></span>
 <span id="cb1-741"><a href="#cb1-741" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-742"><a href="#cb1-742" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_content</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-743"><a href="#cb1-743" aria-hidden="true" tabindex="-1"></a><span class="co">  # Mapping of properties from the input dataset to the chat template. (default:</span></span>
-<span id="cb1-744"><a href="#cb1-744" aria-hidden="true" tabindex="-1"></a><span class="co">  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists</span></span>
-<span id="cb1-745"><a href="#cb1-745" aria-hidden="true" tabindex="-1"></a><span class="co">  # in the template but not in this mapping, the system will attempt to load it directly</span></span>
-<span id="cb1-746"><a href="#cb1-746" aria-hidden="true" tabindex="-1"></a><span class="co">  # from the message using the property name as the key. Example: In the mapping below,</span></span>
-<span id="cb1-747"><a href="#cb1-747" aria-hidden="true" tabindex="-1"></a><span class="co">  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and</span></span>
-<span id="cb1-748"><a href="#cb1-748" aria-hidden="true" tabindex="-1"></a><span class="co">  # used as 'content' in the chat template.</span></span>
-<span id="cb1-749"><a href="#cb1-749" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_property_mappings</span><span class="kw">:</span><span class="at"> dict[str, str] | None</span></span>
-<span id="cb1-750"><a href="#cb1-750" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key in the message turn that indicates via boolean whether tokens of a turn</span></span>
-<span id="cb1-751"><a href="#cb1-751" aria-hidden="true" tabindex="-1"></a><span class="co">  # should be considered for training. Useful to selectively train on certain turns</span></span>
-<span id="cb1-752"><a href="#cb1-752" aria-hidden="true" tabindex="-1"></a><span class="co">  # besides the `roles_to_train`.</span></span>
-<span id="cb1-753"><a href="#cb1-753" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_training</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-754"><a href="#cb1-754" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key in the message turn that contains the training details. Useful to</span></span>
-<span id="cb1-755"><a href="#cb1-755" aria-hidden="true" tabindex="-1"></a><span class="co">  # selectively train on certain tokens in a turn. The value of the key is a List[Dict]</span></span>
-<span id="cb1-756"><a href="#cb1-756" aria-hidden="true" tabindex="-1"></a><span class="co">  # containing `begin_offset` (start character index in content), `end_offset` (end</span></span>
-<span id="cb1-757"><a href="#cb1-757" aria-hidden="true" tabindex="-1"></a><span class="co">  # character index in content), and `train` (boolean whether to train).</span></span>
-<span id="cb1-758"><a href="#cb1-758" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_training_detail</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-759"><a href="#cb1-759" aria-hidden="true" tabindex="-1"></a><span class="co">  # (for Qwen3 template only) Whether to split the assistant content based on a</span></span>
-<span id="cb1-760"><a href="#cb1-760" aria-hidden="true" tabindex="-1"></a><span class="co">  # reasoning trace inside delimited tags</span></span>
-<span id="cb1-761"><a href="#cb1-761" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split_thinking</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-762"><a href="#cb1-762" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">logprobs_field</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-763"><a href="#cb1-763" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">temperature</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-764"><a href="#cb1-764" aria-hidden="true" tabindex="-1"></a><span class="co">  # Roles to train on. The tokens from these roles will be considered for the loss.</span></span>
-<span id="cb1-765"><a href="#cb1-765" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">roles_to_train</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-766"><a href="#cb1-766" aria-hidden="true" tabindex="-1"></a><span class="co">  # Which EOS tokens to train on in the conversation. Possible values are: all: train on</span></span>
-<span id="cb1-767"><a href="#cb1-767" aria-hidden="true" tabindex="-1"></a><span class="co">  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable</span></span>
-<span id="cb1-768"><a href="#cb1-768" aria-hidden="true" tabindex="-1"></a><span class="co">  # turn, last: train on the last EOS token in the conversation</span></span>
-<span id="cb1-769"><a href="#cb1-769" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">train_on_eos</span><span class="kw">:</span><span class="at"> Literal['all', 'turn', 'last'] | None</span></span>
-<span id="cb1-770"><a href="#cb1-770" aria-hidden="true" tabindex="-1"></a><span class="co">  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All</span></span>
-<span id="cb1-771"><a href="#cb1-771" aria-hidden="true" tabindex="-1"></a><span class="co">  # source roles will be mapped to the target role. The default is: user: ["human",</span></span>
-<span id="cb1-772"><a href="#cb1-772" aria-hidden="true" tabindex="-1"></a><span class="co">  # "user"], assistant: ["gpt", "assistant"], system: ["system"], tool: ["tool"]</span></span>
-<span id="cb1-773"><a href="#cb1-773" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">roles</span><span class="kw">:</span><span class="at"> dict[str, list[str]] | None</span></span>
-<span id="cb1-774"><a href="#cb1-774" aria-hidden="true" tabindex="-1"></a><span class="co">  # Whether to drop the system turn from the dataset. Only works with chat_template.</span></span>
-<span id="cb1-775"><a href="#cb1-775" aria-hidden="true" tabindex="-1"></a><span class="co">  # This does not drop the default system message from chat_template if it exists. If</span></span>
-<span id="cb1-776"><a href="#cb1-776" aria-hidden="true" tabindex="-1"></a><span class="co">  # you wish to, we recommend using a custom jinja template with the default system</span></span>
-<span id="cb1-777"><a href="#cb1-777" aria-hidden="true" tabindex="-1"></a><span class="co">  # message removed or adding a system turn with empty content.</span></span>
-<span id="cb1-778"><a href="#cb1-778" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">drop_system_message</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-779"><a href="#cb1-779" aria-hidden="true" tabindex="-1"></a><span class="co">  # Trust remote code for untrusted source</span></span>
-<span id="cb1-780"><a href="#cb1-780" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">trust_remote_code</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-781"><a href="#cb1-781" aria-hidden="true" tabindex="-1"></a><span class="co">  # The specific revision of the dataset to use when loading from the Hugging Face Hub.</span></span>
-<span id="cb1-782"><a href="#cb1-782" aria-hidden="true" tabindex="-1"></a><span class="co">  # This can be a commit hash, tag, or branch name. If not specified, the latest version</span></span>
-<span id="cb1-783"><a href="#cb1-783" aria-hidden="true" tabindex="-1"></a><span class="co">  # will be used. This parameter is ignored for local datasets.</span></span>
-<span id="cb1-784"><a href="#cb1-784" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-785"><a href="#cb1-785" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-786"><a href="#cb1-786" aria-hidden="true" tabindex="-1"></a><span class="co"># The maximum number of processes to use while preprocessing your input dataset. This</span></span>
-<span id="cb1-787"><a href="#cb1-787" aria-hidden="true" tabindex="-1"></a><span class="co"># defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of</span></span>
-<span id="cb1-788"><a href="#cb1-788" aria-hidden="true" tabindex="-1"></a><span class="co"># vCPUs via RUNPOD_CPU_COUNT.</span></span>
-<span id="cb1-789"><a href="#cb1-789" aria-hidden="true" tabindex="-1"></a><span class="fu">dataset_processes</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-790"><a href="#cb1-790" aria-hidden="true" tabindex="-1"></a><span class="co"># The maximum number of processes to use while preprocessing your input dataset. This</span></span>
-<span id="cb1-791"><a href="#cb1-791" aria-hidden="true" tabindex="-1"></a><span class="co"># defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of</span></span>
-<span id="cb1-792"><a href="#cb1-792" aria-hidden="true" tabindex="-1"></a><span class="co"># vCPUs via RUNPOD_CPU_COUNT.</span></span>
-<span id="cb1-793"><a href="#cb1-793" aria-hidden="true" tabindex="-1"></a><span class="fu">dataset_num_proc</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-794"><a href="#cb1-794" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-795"><a href="#cb1-795" aria-hidden="true" tabindex="-1"></a><span class="co"># Deduplicates datasets and test_datasets with identical entries</span></span>
-<span id="cb1-796"><a href="#cb1-796" aria-hidden="true" tabindex="-1"></a><span class="fu">dataset_exact_deduplication</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-797"><a href="#cb1-797" aria-hidden="true" tabindex="-1"></a><span class="co"># Keep dataset in memory while preprocessing. Only needed if cached dataset is taking</span></span>
-<span id="cb1-798"><a href="#cb1-798" aria-hidden="true" tabindex="-1"></a><span class="co"># too much storage</span></span>
-<span id="cb1-799"><a href="#cb1-799" aria-hidden="true" tabindex="-1"></a><span class="fu">dataset_keep_in_memory</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-800"><a href="#cb1-800" aria-hidden="true" tabindex="-1"></a><span class="fu">dataloader_pin_memory</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-801"><a href="#cb1-801" aria-hidden="true" tabindex="-1"></a><span class="fu">dataloader_num_workers</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-802"><a href="#cb1-802" aria-hidden="true" tabindex="-1"></a><span class="fu">dataloader_prefetch_factor</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-803"><a href="#cb1-803" aria-hidden="true" tabindex="-1"></a><span class="fu">dataloader_drop_last</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-804"><a href="#cb1-804" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-805"><a href="#cb1-805" aria-hidden="true" tabindex="-1"></a><span class="fu">accelerator_config</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
+<span id="cb1-742"><a href="#cb1-742" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_role</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-743"><a href="#cb1-743" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-744"><a href="#cb1-744" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_content</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-745"><a href="#cb1-745" aria-hidden="true" tabindex="-1"></a><span class="co">  # Mapping of properties from the input dataset to the chat template. (default:</span></span>
+<span id="cb1-746"><a href="#cb1-746" aria-hidden="true" tabindex="-1"></a><span class="co">  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists</span></span>
+<span id="cb1-747"><a href="#cb1-747" aria-hidden="true" tabindex="-1"></a><span class="co">  # in the template but not in this mapping, the system will attempt to load it directly</span></span>
+<span id="cb1-748"><a href="#cb1-748" aria-hidden="true" tabindex="-1"></a><span class="co">  # from the message using the property name as the key. Example: In the mapping below,</span></span>
+<span id="cb1-749"><a href="#cb1-749" aria-hidden="true" tabindex="-1"></a><span class="co">  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and</span></span>
+<span id="cb1-750"><a href="#cb1-750" aria-hidden="true" tabindex="-1"></a><span class="co">  # used as 'content' in the chat template.</span></span>
+<span id="cb1-751"><a href="#cb1-751" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_property_mappings</span><span class="kw">:</span><span class="at"> dict[str, str] | None</span></span>
+<span id="cb1-752"><a href="#cb1-752" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key in the message turn that indicates via boolean whether tokens of a turn</span></span>
+<span id="cb1-753"><a href="#cb1-753" aria-hidden="true" tabindex="-1"></a><span class="co">  # should be considered for training. Useful to selectively train on certain turns</span></span>
+<span id="cb1-754"><a href="#cb1-754" aria-hidden="true" tabindex="-1"></a><span class="co">  # besides the `roles_to_train`.</span></span>
+<span id="cb1-755"><a href="#cb1-755" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_training</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-756"><a href="#cb1-756" aria-hidden="true" tabindex="-1"></a><span class="co">  # The key in the message turn that contains the training details. Useful to</span></span>
+<span id="cb1-757"><a href="#cb1-757" aria-hidden="true" tabindex="-1"></a><span class="co">  # selectively train on certain tokens in a turn. The value of the key is a List[Dict]</span></span>
+<span id="cb1-758"><a href="#cb1-758" aria-hidden="true" tabindex="-1"></a><span class="co">  # containing `begin_offset` (start character index in content), `end_offset` (end</span></span>
+<span id="cb1-759"><a href="#cb1-759" aria-hidden="true" tabindex="-1"></a><span class="co">  # character index in content), and `train` (boolean whether to train).</span></span>
+<span id="cb1-760"><a href="#cb1-760" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">message_field_training_detail</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-761"><a href="#cb1-761" aria-hidden="true" tabindex="-1"></a><span class="co">  # (for Qwen3 template only) Whether to split the assistant content based on a</span></span>
+<span id="cb1-762"><a href="#cb1-762" aria-hidden="true" tabindex="-1"></a><span class="co">  # reasoning trace inside delimited tags</span></span>
+<span id="cb1-763"><a href="#cb1-763" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">split_thinking</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-764"><a href="#cb1-764" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">logprobs_field</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-765"><a href="#cb1-765" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">temperature</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-766"><a href="#cb1-766" aria-hidden="true" tabindex="-1"></a><span class="co">  # Roles to train on. The tokens from these roles will be considered for the loss.</span></span>
+<span id="cb1-767"><a href="#cb1-767" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">roles_to_train</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-768"><a href="#cb1-768" aria-hidden="true" tabindex="-1"></a><span class="co">  # Which EOS tokens to train on in the conversation. Possible values are: all: train on</span></span>
+<span id="cb1-769"><a href="#cb1-769" aria-hidden="true" tabindex="-1"></a><span class="co">  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable</span></span>
+<span id="cb1-770"><a href="#cb1-770" aria-hidden="true" tabindex="-1"></a><span class="co">  # turn, last: train on the last EOS token in the conversation</span></span>
+<span id="cb1-771"><a href="#cb1-771" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">train_on_eos</span><span class="kw">:</span><span class="at"> Literal['all', 'turn', 'last'] | None</span></span>
+<span id="cb1-772"><a href="#cb1-772" aria-hidden="true" tabindex="-1"></a><span class="co">  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All</span></span>
+<span id="cb1-773"><a href="#cb1-773" aria-hidden="true" tabindex="-1"></a><span class="co">  # source roles will be mapped to the target role. The default is: user: ["human",</span></span>
+<span id="cb1-774"><a href="#cb1-774" aria-hidden="true" tabindex="-1"></a><span class="co">  # "user"], assistant: ["gpt", "assistant"], system: ["system"], tool: ["tool"]</span></span>
+<span id="cb1-775"><a href="#cb1-775" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">roles</span><span class="kw">:</span><span class="at"> dict[str, list[str]] | None</span></span>
+<span id="cb1-776"><a href="#cb1-776" aria-hidden="true" tabindex="-1"></a><span class="co">  # Whether to drop the system turn from the dataset. Only works with chat_template.</span></span>
+<span id="cb1-777"><a href="#cb1-777" aria-hidden="true" tabindex="-1"></a><span class="co">  # This does not drop the default system message from chat_template if it exists. If</span></span>
+<span id="cb1-778"><a href="#cb1-778" aria-hidden="true" tabindex="-1"></a><span class="co">  # you wish to, we recommend using a custom jinja template with the default system</span></span>
+<span id="cb1-779"><a href="#cb1-779" aria-hidden="true" tabindex="-1"></a><span class="co">  # message removed or adding a system turn with empty content.</span></span>
+<span id="cb1-780"><a href="#cb1-780" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">drop_system_message</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-781"><a href="#cb1-781" aria-hidden="true" tabindex="-1"></a><span class="co">  # Trust remote code for untrusted source</span></span>
+<span id="cb1-782"><a href="#cb1-782" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">trust_remote_code</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-783"><a href="#cb1-783" aria-hidden="true" tabindex="-1"></a><span class="co">  # The specific revision of the dataset to use when loading from the Hugging Face Hub.</span></span>
+<span id="cb1-784"><a href="#cb1-784" aria-hidden="true" tabindex="-1"></a><span class="co">  # This can be a commit hash, tag, or branch name. If not specified, the latest version</span></span>
+<span id="cb1-785"><a href="#cb1-785" aria-hidden="true" tabindex="-1"></a><span class="co">  # will be used. This parameter is ignored for local datasets.</span></span>
+<span id="cb1-786"><a href="#cb1-786" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-787"><a href="#cb1-787" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-788"><a href="#cb1-788" aria-hidden="true" tabindex="-1"></a><span class="co"># The maximum number of processes to use while preprocessing your input dataset. This</span></span>
+<span id="cb1-789"><a href="#cb1-789" aria-hidden="true" tabindex="-1"></a><span class="co"># defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of</span></span>
+<span id="cb1-790"><a href="#cb1-790" aria-hidden="true" tabindex="-1"></a><span class="co"># vCPUs via RUNPOD_CPU_COUNT.</span></span>
+<span id="cb1-791"><a href="#cb1-791" aria-hidden="true" tabindex="-1"></a><span class="fu">dataset_processes</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-792"><a href="#cb1-792" aria-hidden="true" tabindex="-1"></a><span class="co"># The maximum number of processes to use while preprocessing your input dataset. This</span></span>
+<span id="cb1-793"><a href="#cb1-793" aria-hidden="true" tabindex="-1"></a><span class="co"># defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of</span></span>
+<span id="cb1-794"><a href="#cb1-794" aria-hidden="true" tabindex="-1"></a><span class="co"># vCPUs via RUNPOD_CPU_COUNT.</span></span>
+<span id="cb1-795"><a href="#cb1-795" aria-hidden="true" tabindex="-1"></a><span class="fu">dataset_num_proc</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-796"><a href="#cb1-796" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-797"><a href="#cb1-797" aria-hidden="true" tabindex="-1"></a><span class="co"># Deduplicates datasets and test_datasets with identical entries</span></span>
+<span id="cb1-798"><a href="#cb1-798" aria-hidden="true" tabindex="-1"></a><span class="fu">dataset_exact_deduplication</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-799"><a href="#cb1-799" aria-hidden="true" tabindex="-1"></a><span class="co"># Keep dataset in memory while preprocessing. Only needed if cached dataset is taking</span></span>
+<span id="cb1-800"><a href="#cb1-800" aria-hidden="true" tabindex="-1"></a><span class="co"># too much storage</span></span>
+<span id="cb1-801"><a href="#cb1-801" aria-hidden="true" tabindex="-1"></a><span class="fu">dataset_keep_in_memory</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-802"><a href="#cb1-802" aria-hidden="true" tabindex="-1"></a><span class="fu">dataloader_pin_memory</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-803"><a href="#cb1-803" aria-hidden="true" tabindex="-1"></a><span class="fu">dataloader_num_workers</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-804"><a href="#cb1-804" aria-hidden="true" tabindex="-1"></a><span class="fu">dataloader_prefetch_factor</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-805"><a href="#cb1-805" aria-hidden="true" tabindex="-1"></a><span class="fu">dataloader_drop_last</span><span class="kw">:</span><span class="at"> bool | None</span></span>
 <span id="cb1-806"><a href="#cb1-806" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-807"><a href="#cb1-807" aria-hidden="true" tabindex="-1"></a><span class="fu">remove_unused_columns</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-807"><a href="#cb1-807" aria-hidden="true" tabindex="-1"></a><span class="fu">accelerator_config</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
 <span id="cb1-808"><a href="#cb1-808" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-809"><a href="#cb1-809" aria-hidden="true" tabindex="-1"></a><span class="co"># Push prepared dataset to hub - repo_org/repo_name</span></span>
-<span id="cb1-810"><a href="#cb1-810" aria-hidden="true" tabindex="-1"></a><span class="fu">push_dataset_to_hub</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-811"><a href="#cb1-811" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private</span></span>
-<span id="cb1-812"><a href="#cb1-812" aria-hidden="true" tabindex="-1"></a><span class="co"># datasets. Required to be true when used in combination with `push_dataset_to_hub`</span></span>
-<span id="cb1-813"><a href="#cb1-813" aria-hidden="true" tabindex="-1"></a><span class="fu">hf_use_auth_token</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-814"><a href="#cb1-814" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-815"><a href="#cb1-815" aria-hidden="true" tabindex="-1"></a><span class="fu">device</span><span class="kw">:</span><span class="at"> Any | None</span></span>
-<span id="cb1-816"><a href="#cb1-816" aria-hidden="true" tabindex="-1"></a><span class="co"># Passed through to transformers when loading the model when launched without</span></span>
-<span id="cb1-817"><a href="#cb1-817" aria-hidden="true" tabindex="-1"></a><span class="co"># accelerate. Use `sequential` when training w/ model parallelism to limit memory</span></span>
-<span id="cb1-818"><a href="#cb1-818" aria-hidden="true" tabindex="-1"></a><span class="fu">device_map</span><span class="kw">:</span><span class="at"> Any | None</span></span>
-<span id="cb1-819"><a href="#cb1-819" aria-hidden="true" tabindex="-1"></a><span class="fu">world_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-820"><a href="#cb1-820" aria-hidden="true" tabindex="-1"></a><span class="co"># Don't mess with this, it's here for accelerate and torchrun</span></span>
-<span id="cb1-821"><a href="#cb1-821" aria-hidden="true" tabindex="-1"></a><span class="fu">local_rank</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-822"><a href="#cb1-822" aria-hidden="true" tabindex="-1"></a><span class="fu">ddp</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-823"><a href="#cb1-823" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-824"><a href="#cb1-824" aria-hidden="true" tabindex="-1"></a><span class="co"># Seed for reproducibility</span></span>
-<span id="cb1-825"><a href="#cb1-825" aria-hidden="true" tabindex="-1"></a><span class="fu">seed</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-826"><a href="#cb1-826" aria-hidden="true" tabindex="-1"></a><span class="co"># Advanced DDP Arguments - timeout</span></span>
-<span id="cb1-827"><a href="#cb1-827" aria-hidden="true" tabindex="-1"></a><span class="fu">ddp_timeout</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-828"><a href="#cb1-828" aria-hidden="true" tabindex="-1"></a><span class="co"># Advanced DDP Arguments - bucket cap in MB</span></span>
-<span id="cb1-829"><a href="#cb1-829" aria-hidden="true" tabindex="-1"></a><span class="fu">ddp_bucket_cap_mb</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-830"><a href="#cb1-830" aria-hidden="true" tabindex="-1"></a><span class="co"># Advanced DDP Arguments - broadcast buffers</span></span>
-<span id="cb1-831"><a href="#cb1-831" aria-hidden="true" tabindex="-1"></a><span class="fu">ddp_broadcast_buffers</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-832"><a href="#cb1-832" aria-hidden="true" tabindex="-1"></a><span class="fu">ddp_find_unused_parameters</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-833"><a href="#cb1-833" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-834"><a href="#cb1-834" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to run causal language model evaluation for metrics in</span></span>
-<span id="cb1-835"><a href="#cb1-835" aria-hidden="true" tabindex="-1"></a><span class="co"># `eval_causal_lm_metrics`</span></span>
-<span id="cb1-836"><a href="#cb1-836" aria-hidden="true" tabindex="-1"></a><span class="fu">do_causal_lm_eval</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-837"><a href="#cb1-837" aria-hidden="true" tabindex="-1"></a><span class="co"># HF evaluate metrics used during evaluation. Default is ['sacrebleu', 'comet', 'ter',</span></span>
-<span id="cb1-838"><a href="#cb1-838" aria-hidden="true" tabindex="-1"></a><span class="co"># 'chrf', 'perplexity']</span></span>
-<span id="cb1-839"><a href="#cb1-839" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_causal_lm_metrics</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-840"><a href="#cb1-840" aria-hidden="true" tabindex="-1"></a><span class="fu">do_bench_eval</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-841"><a href="#cb1-841" aria-hidden="true" tabindex="-1"></a><span class="fu">bench_dataset</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-842"><a href="#cb1-842" aria-hidden="true" tabindex="-1"></a><span class="fu">bench_split</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-843"><a href="#cb1-843" aria-hidden="true" tabindex="-1"></a><span class="fu">metric_for_best_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-844"><a href="#cb1-844" aria-hidden="true" tabindex="-1"></a><span class="fu">greater_is_better</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-845"><a href="#cb1-845" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-846"><a href="#cb1-846" aria-hidden="true" tabindex="-1"></a><span class="co"># High loss value, indicating the learning has broken down (a good estimate is ~2 times</span></span>
-<span id="cb1-847"><a href="#cb1-847" aria-hidden="true" tabindex="-1"></a><span class="co"># the loss at the start of training)</span></span>
-<span id="cb1-848"><a href="#cb1-848" aria-hidden="true" tabindex="-1"></a><span class="fu">loss_watchdog_threshold</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-849"><a href="#cb1-849" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of high-loss steps in a row before the trainer aborts (default: 3)</span></span>
-<span id="cb1-850"><a href="#cb1-850" aria-hidden="true" tabindex="-1"></a><span class="fu">loss_watchdog_patience</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-851"><a href="#cb1-851" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-852"><a href="#cb1-852" aria-hidden="true" tabindex="-1"></a><span class="co"># Run garbage collection every `gc_steps` steps. -1 will run on epoch end and before</span></span>
-<span id="cb1-853"><a href="#cb1-853" aria-hidden="true" tabindex="-1"></a><span class="co"># evaluations. Default is 0 (disabled).</span></span>
-<span id="cb1-854"><a href="#cb1-854" aria-hidden="true" tabindex="-1"></a><span class="fu">gc_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-855"><a href="#cb1-855" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-856"><a href="#cb1-856" aria-hidden="true" tabindex="-1"></a><span class="co"># Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection.</span></span>
-<span id="cb1-857"><a href="#cb1-857" aria-hidden="true" tabindex="-1"></a><span class="co"># require &gt;=ampere</span></span>
-<span id="cb1-858"><a href="#cb1-858" aria-hidden="true" tabindex="-1"></a><span class="fu">bf16</span><span class="kw">:</span><span class="at"> Literal['auto'] | bool | None = auto</span></span>
-<span id="cb1-859"><a href="#cb1-859" aria-hidden="true" tabindex="-1"></a><span class="co"># Use CUDA fp16</span></span>
-<span id="cb1-860"><a href="#cb1-860" aria-hidden="true" tabindex="-1"></a><span class="fu">fp16</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-861"><a href="#cb1-861" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable FP8 mixed precision training using TorchAO. Best used in combination with</span></span>
-<span id="cb1-862"><a href="#cb1-862" aria-hidden="true" tabindex="-1"></a><span class="co"># torch.compile.</span></span>
-<span id="cb1-863"><a href="#cb1-863" aria-hidden="true" tabindex="-1"></a><span class="fu">fp8</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-864"><a href="#cb1-864" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable FSDP float8 all-gather optimization for FP8 training. Can improve training</span></span>
-<span id="cb1-865"><a href="#cb1-865" aria-hidden="true" tabindex="-1"></a><span class="co"># speed by 10-15% when FSDP is enabled.</span></span>
-<span id="cb1-866"><a href="#cb1-866" aria-hidden="true" tabindex="-1"></a><span class="fu">fp8_enable_fsdp_float8_all_gather</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-867"><a href="#cb1-867" aria-hidden="true" tabindex="-1"></a><span class="co"># No AMP (automatic mixed precision) - require &gt;=ampere</span></span>
-<span id="cb1-868"><a href="#cb1-868" aria-hidden="true" tabindex="-1"></a><span class="fu">bfloat16</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-869"><a href="#cb1-869" aria-hidden="true" tabindex="-1"></a><span class="co"># No AMP (automatic mixed precision)</span></span>
-<span id="cb1-870"><a href="#cb1-870" aria-hidden="true" tabindex="-1"></a><span class="fu">float16</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-871"><a href="#cb1-871" aria-hidden="true" tabindex="-1"></a><span class="co"># bool to use CUDA tf32 or 'auto' for automatic detection - require &gt;=ampere</span></span>
-<span id="cb1-872"><a href="#cb1-872" aria-hidden="true" tabindex="-1"></a><span class="fu">tf32</span><span class="kw">:</span><span class="at"> Literal['auto'] | bool | None = auto</span></span>
-<span id="cb1-873"><a href="#cb1-873" aria-hidden="true" tabindex="-1"></a><span class="fu">float32</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-874"><a href="#cb1-874" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-875"><a href="#cb1-875" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use gradient checkpointing. Available options are: true, false, 'offload',</span></span>
-<span id="cb1-876"><a href="#cb1-876" aria-hidden="true" tabindex="-1"></a><span class="co"># 'offload_disk'.</span></span>
-<span id="cb1-877"><a href="#cb1-877" aria-hidden="true" tabindex="-1"></a><span class="co"># https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing</span></span>
-<span id="cb1-878"><a href="#cb1-878" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_checkpointing</span><span class="kw">:</span><span class="at"> Literal['offload', 'offload_disk'] | bool | None = False</span></span>
-<span id="cb1-879"><a href="#cb1-879" aria-hidden="true" tabindex="-1"></a><span class="co"># Additional kwargs to pass to the trainer for gradient checkpointing</span></span>
-<span id="cb1-880"><a href="#cb1-880" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_checkpointing_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
-<span id="cb1-881"><a href="#cb1-881" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to offload activations. Available options are: true, false, 'legacy', 'disk'.</span></span>
-<span id="cb1-882"><a href="#cb1-882" aria-hidden="true" tabindex="-1"></a><span class="fu">activation_offloading</span><span class="kw">:</span><span class="at"> Literal['legacy', 'disk'] | bool | None = False</span></span>
-<span id="cb1-883"><a href="#cb1-883" aria-hidden="true" tabindex="-1"></a><span class="co"># Offload model layer parameters to CPU during forward, prefetch back during backward.</span></span>
-<span id="cb1-884"><a href="#cb1-884" aria-hidden="true" tabindex="-1"></a><span class="fu">layer_offloading</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-885"><a href="#cb1-885" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-886"><a href="#cb1-886" aria-hidden="true" tabindex="-1"></a><span class="co"># List of regex patterns for parameter names to keep unfrozen. All other parameters will</span></span>
-<span id="cb1-887"><a href="#cb1-887" aria-hidden="true" tabindex="-1"></a><span class="co"># be frozen via requires_grad=False. Note: range-based patterns (e.g.</span></span>
-<span id="cb1-888"><a href="#cb1-888" aria-hidden="true" tabindex="-1"></a><span class="co"># embed_tokens.weight$[:32000]) use gradient zeroing rather than a true freeze, so</span></span>
-<span id="cb1-889"><a href="#cb1-889" aria-hidden="true" tabindex="-1"></a><span class="co"># weight decay will still apply to the frozen portion and optimizer states are allocated</span></span>
-<span id="cb1-890"><a href="#cb1-890" aria-hidden="true" tabindex="-1"></a><span class="co"># for the full parameter.</span></span>
-<span id="cb1-891"><a href="#cb1-891" aria-hidden="true" tabindex="-1"></a><span class="fu">unfrozen_parameters</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-892"><a href="#cb1-892" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-893"><a href="#cb1-893" aria-hidden="true" tabindex="-1"></a><span class="co"># The maximum length of an input to train with, this should typically be less than 2048</span></span>
-<span id="cb1-894"><a href="#cb1-894" aria-hidden="true" tabindex="-1"></a><span class="co"># as most models have a token/context limit of 2048</span></span>
-<span id="cb1-895"><a href="#cb1-895" aria-hidden="true" tabindex="-1"></a><span class="fu">sequence_len</span><span class="kw">:</span><span class="at"> int = 512</span></span>
-<span id="cb1-896"><a href="#cb1-896" aria-hidden="true" tabindex="-1"></a><span class="co"># What to do when a tokenized row exceeds sequence_len. 'drop' removes the row;</span></span>
-<span id="cb1-897"><a href="#cb1-897" aria-hidden="true" tabindex="-1"></a><span class="co"># 'truncate' slices tensors to sequence_len; 'raise' raises a ValueError. Defaults to</span></span>
-<span id="cb1-898"><a href="#cb1-898" aria-hidden="true" tabindex="-1"></a><span class="co"># 'drop' for backward compatibility.</span></span>
-<span id="cb1-899"><a href="#cb1-899" aria-hidden="true" tabindex="-1"></a><span class="fu">excess_length_strategy</span><span class="kw">:</span><span class="at"> Literal['drop', 'truncate', 'raise'] | None</span></span>
-<span id="cb1-900"><a href="#cb1-900" aria-hidden="true" tabindex="-1"></a><span class="co"># The maximum length of an input for evaluation. If not specified, defaults to</span></span>
-<span id="cb1-901"><a href="#cb1-901" aria-hidden="true" tabindex="-1"></a><span class="co"># sequence_len</span></span>
-<span id="cb1-902"><a href="#cb1-902" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_sequence_len</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-903"><a href="#cb1-903" aria-hidden="true" tabindex="-1"></a><span class="fu">min_sample_len</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-904"><a href="#cb1-904" aria-hidden="true" tabindex="-1"></a><span class="co"># maximum prompt length for RL training</span></span>
-<span id="cb1-905"><a href="#cb1-905" aria-hidden="true" tabindex="-1"></a><span class="fu">max_prompt_len</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-906"><a href="#cb1-906" aria-hidden="true" tabindex="-1"></a><span class="co"># Use efficient multi-packing with block diagonal attention and per sequence</span></span>
-<span id="cb1-907"><a href="#cb1-907" aria-hidden="true" tabindex="-1"></a><span class="co"># position_ids. Recommend set to 'true'</span></span>
-<span id="cb1-908"><a href="#cb1-908" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-909"><a href="#cb1-909" aria-hidden="true" tabindex="-1"></a><span class="co"># The number of samples packed at a time. Increasing the following values helps with</span></span>
-<span id="cb1-910"><a href="#cb1-910" aria-hidden="true" tabindex="-1"></a><span class="co"># packing, but usually only slightly (&lt;%1.)</span></span>
-<span id="cb1-911"><a href="#cb1-911" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing_group_size</span><span class="kw">:</span><span class="at"> int | None = 100000</span></span>
-<span id="cb1-912"><a href="#cb1-912" aria-hidden="true" tabindex="-1"></a><span class="co"># The number of samples which can be packed into one sequence. Increase if using a large</span></span>
-<span id="cb1-913"><a href="#cb1-913" aria-hidden="true" tabindex="-1"></a><span class="co"># sequence_len with many short samples.</span></span>
-<span id="cb1-914"><a href="#cb1-914" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing_bin_size</span><span class="kw">:</span><span class="at"> int | None = 200</span></span>
-<span id="cb1-915"><a href="#cb1-915" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to pack samples sequentially</span></span>
-<span id="cb1-916"><a href="#cb1-916" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing_sequentially</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-917"><a href="#cb1-917" aria-hidden="true" tabindex="-1"></a><span class="co"># The multiprocessing start method to use for packing. Should be 'fork', 'spawn' or</span></span>
-<span id="cb1-918"><a href="#cb1-918" aria-hidden="true" tabindex="-1"></a><span class="co"># 'forkserver'</span></span>
-<span id="cb1-919"><a href="#cb1-919" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing_mp_start_method</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-920"><a href="#cb1-920" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to 'false' if getting errors during eval with sample_packing on</span></span>
-<span id="cb1-921"><a href="#cb1-921" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_sample_packing</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-922"><a href="#cb1-922" aria-hidden="true" tabindex="-1"></a><span class="co"># Pad inputs so each step uses constant sized buffers. This will reduce memory</span></span>
-<span id="cb1-923"><a href="#cb1-923" aria-hidden="true" tabindex="-1"></a><span class="co"># fragmentation and may prevent OOMs, by re-using memory more efficiently. Defaults to</span></span>
-<span id="cb1-924"><a href="#cb1-924" aria-hidden="true" tabindex="-1"></a><span class="co"># True if `sample_packing` enabled</span></span>
-<span id="cb1-925"><a href="#cb1-925" aria-hidden="true" tabindex="-1"></a><span class="fu">pad_to_sequence_len</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-926"><a href="#cb1-926" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use sequential sampling for curriculum learning</span></span>
-<span id="cb1-927"><a href="#cb1-927" aria-hidden="true" tabindex="-1"></a><span class="fu">curriculum_sampling</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-928"><a href="#cb1-928" aria-hidden="true" tabindex="-1"></a><span class="fu">multipack_real_batches</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-929"><a href="#cb1-929" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-930"><a href="#cb1-930" aria-hidden="true" tabindex="-1"></a><span class="co"># Use batch flattening for speedups when not using sample_packing</span></span>
-<span id="cb1-931"><a href="#cb1-931" aria-hidden="true" tabindex="-1"></a><span class="fu">batch_flattening</span><span class="kw">:</span><span class="at"> Literal['auto'] | bool | None</span></span>
-<span id="cb1-932"><a href="#cb1-932" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-933"><a href="#cb1-933" aria-hidden="true" tabindex="-1"></a><span class="fu">use_pose</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-934"><a href="#cb1-934" aria-hidden="true" tabindex="-1"></a><span class="fu">pose_split_on_token_ids</span><span class="kw">:</span><span class="at"> list[int] | None</span></span>
-<span id="cb1-935"><a href="#cb1-935" aria-hidden="true" tabindex="-1"></a><span class="fu">pose_max_context_len</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-936"><a href="#cb1-936" aria-hidden="true" tabindex="-1"></a><span class="fu">pose_num_chunks</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-937"><a href="#cb1-937" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-938"><a href="#cb1-938" aria-hidden="true" tabindex="-1"></a><span class="fu">pretrain_multipack_buffer_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-939"><a href="#cb1-939" aria-hidden="true" tabindex="-1"></a><span class="co"># whether to prevent cross attention for packed sequences during pretraining</span></span>
-<span id="cb1-940"><a href="#cb1-940" aria-hidden="true" tabindex="-1"></a><span class="fu">pretrain_multipack_attn</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
-<span id="cb1-941"><a href="#cb1-941" aria-hidden="true" tabindex="-1"></a><span class="co"># whether to concatenate samples during pretraining</span></span>
-<span id="cb1-942"><a href="#cb1-942" aria-hidden="true" tabindex="-1"></a><span class="fu">pretraining_sample_concatenation</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-943"><a href="#cb1-943" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-944"><a href="#cb1-944" aria-hidden="true" tabindex="-1"></a><span class="co"># Use streaming mode for loading datasets</span></span>
-<span id="cb1-945"><a href="#cb1-945" aria-hidden="true" tabindex="-1"></a><span class="fu">streaming</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-946"><a href="#cb1-946" aria-hidden="true" tabindex="-1"></a><span class="co"># Buffer size for multipack streaming datasets</span></span>
-<span id="cb1-947"><a href="#cb1-947" aria-hidden="true" tabindex="-1"></a><span class="fu">streaming_multipack_buffer_size</span><span class="kw">:</span><span class="at"> int | None = 10000</span></span>
-<span id="cb1-948"><a href="#cb1-948" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-949"><a href="#cb1-949" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use xformers attention patch https://github.com/facebookresearch/xformers</span></span>
-<span id="cb1-950"><a href="#cb1-950" aria-hidden="true" tabindex="-1"></a><span class="fu">xformers_attention</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-951"><a href="#cb1-951" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/</span></span>
-<span id="cb1-952"><a href="#cb1-952" aria-hidden="true" tabindex="-1"></a><span class="co"># torch.nn.functional.scaled_dot_product_attention.html</span></span>
-<span id="cb1-953"><a href="#cb1-953" aria-hidden="true" tabindex="-1"></a><span class="fu">sdp_attention</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-954"><a href="#cb1-954" aria-hidden="true" tabindex="-1"></a><span class="co"># Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf</span></span>
-<span id="cb1-955"><a href="#cb1-955" aria-hidden="true" tabindex="-1"></a><span class="fu">s2_attention</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-956"><a href="#cb1-956" aria-hidden="true" tabindex="-1"></a><span class="fu">flex_attention</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-957"><a href="#cb1-957" aria-hidden="true" tabindex="-1"></a><span class="fu">flex_attn_compile_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
-<span id="cb1-958"><a href="#cb1-958" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention</span></span>
-<span id="cb1-959"><a href="#cb1-959" aria-hidden="true" tabindex="-1"></a><span class="fu">flash_attention</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-960"><a href="#cb1-960" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use flash-attention cross entropy implementation - advanced use only</span></span>
-<span id="cb1-961"><a href="#cb1-961" aria-hidden="true" tabindex="-1"></a><span class="fu">flash_attn_cross_entropy</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-962"><a href="#cb1-962" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use flash-attention rms norm implementation - advanced use only</span></span>
-<span id="cb1-963"><a href="#cb1-963" aria-hidden="true" tabindex="-1"></a><span class="fu">flash_attn_rms_norm</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-964"><a href="#cb1-964" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to fuse part of the MLP into a single operation</span></span>
-<span id="cb1-965"><a href="#cb1-965" aria-hidden="true" tabindex="-1"></a><span class="fu">flash_attn_fuse_mlp</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-966"><a href="#cb1-966" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use bettertransformers</span></span>
-<span id="cb1-967"><a href="#cb1-967" aria-hidden="true" tabindex="-1"></a><span class="fu">flash_optimum</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-968"><a href="#cb1-968" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use SageAttention https://github.com/thu-ml/SageAttention</span></span>
-<span id="cb1-969"><a href="#cb1-969" aria-hidden="true" tabindex="-1"></a><span class="fu">sage_attention</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-970"><a href="#cb1-970" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-971"><a href="#cb1-971" aria-hidden="true" tabindex="-1"></a><span class="fu">eager_attention</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-809"><a href="#cb1-809" aria-hidden="true" tabindex="-1"></a><span class="fu">remove_unused_columns</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-810"><a href="#cb1-810" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-811"><a href="#cb1-811" aria-hidden="true" tabindex="-1"></a><span class="co"># Push prepared dataset to hub - repo_org/repo_name</span></span>
+<span id="cb1-812"><a href="#cb1-812" aria-hidden="true" tabindex="-1"></a><span class="fu">push_dataset_to_hub</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-813"><a href="#cb1-813" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private</span></span>
+<span id="cb1-814"><a href="#cb1-814" aria-hidden="true" tabindex="-1"></a><span class="co"># datasets. Required to be true when used in combination with `push_dataset_to_hub`</span></span>
+<span id="cb1-815"><a href="#cb1-815" aria-hidden="true" tabindex="-1"></a><span class="fu">hf_use_auth_token</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-816"><a href="#cb1-816" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-817"><a href="#cb1-817" aria-hidden="true" tabindex="-1"></a><span class="fu">device</span><span class="kw">:</span><span class="at"> Any | None</span></span>
+<span id="cb1-818"><a href="#cb1-818" aria-hidden="true" tabindex="-1"></a><span class="co"># Passed through to transformers when loading the model when launched without</span></span>
+<span id="cb1-819"><a href="#cb1-819" aria-hidden="true" tabindex="-1"></a><span class="co"># accelerate. Use `sequential` when training w/ model parallelism to limit memory</span></span>
+<span id="cb1-820"><a href="#cb1-820" aria-hidden="true" tabindex="-1"></a><span class="fu">device_map</span><span class="kw">:</span><span class="at"> Any | None</span></span>
+<span id="cb1-821"><a href="#cb1-821" aria-hidden="true" tabindex="-1"></a><span class="fu">world_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-822"><a href="#cb1-822" aria-hidden="true" tabindex="-1"></a><span class="co"># Don't mess with this, it's here for accelerate and torchrun</span></span>
+<span id="cb1-823"><a href="#cb1-823" aria-hidden="true" tabindex="-1"></a><span class="fu">local_rank</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-824"><a href="#cb1-824" aria-hidden="true" tabindex="-1"></a><span class="fu">ddp</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-825"><a href="#cb1-825" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-826"><a href="#cb1-826" aria-hidden="true" tabindex="-1"></a><span class="co"># Seed for reproducibility</span></span>
+<span id="cb1-827"><a href="#cb1-827" aria-hidden="true" tabindex="-1"></a><span class="fu">seed</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-828"><a href="#cb1-828" aria-hidden="true" tabindex="-1"></a><span class="co"># Advanced DDP Arguments - timeout</span></span>
+<span id="cb1-829"><a href="#cb1-829" aria-hidden="true" tabindex="-1"></a><span class="fu">ddp_timeout</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-830"><a href="#cb1-830" aria-hidden="true" tabindex="-1"></a><span class="co"># Advanced DDP Arguments - bucket cap in MB</span></span>
+<span id="cb1-831"><a href="#cb1-831" aria-hidden="true" tabindex="-1"></a><span class="fu">ddp_bucket_cap_mb</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-832"><a href="#cb1-832" aria-hidden="true" tabindex="-1"></a><span class="co"># Advanced DDP Arguments - broadcast buffers</span></span>
+<span id="cb1-833"><a href="#cb1-833" aria-hidden="true" tabindex="-1"></a><span class="fu">ddp_broadcast_buffers</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-834"><a href="#cb1-834" aria-hidden="true" tabindex="-1"></a><span class="fu">ddp_find_unused_parameters</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-835"><a href="#cb1-835" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-836"><a href="#cb1-836" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to run causal language model evaluation for metrics in</span></span>
+<span id="cb1-837"><a href="#cb1-837" aria-hidden="true" tabindex="-1"></a><span class="co"># `eval_causal_lm_metrics`</span></span>
+<span id="cb1-838"><a href="#cb1-838" aria-hidden="true" tabindex="-1"></a><span class="fu">do_causal_lm_eval</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-839"><a href="#cb1-839" aria-hidden="true" tabindex="-1"></a><span class="co"># HF evaluate metrics used during evaluation. Default is ['sacrebleu', 'comet', 'ter',</span></span>
+<span id="cb1-840"><a href="#cb1-840" aria-hidden="true" tabindex="-1"></a><span class="co"># 'chrf', 'perplexity']</span></span>
+<span id="cb1-841"><a href="#cb1-841" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_causal_lm_metrics</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-842"><a href="#cb1-842" aria-hidden="true" tabindex="-1"></a><span class="fu">do_bench_eval</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-843"><a href="#cb1-843" aria-hidden="true" tabindex="-1"></a><span class="fu">bench_dataset</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-844"><a href="#cb1-844" aria-hidden="true" tabindex="-1"></a><span class="fu">bench_split</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-845"><a href="#cb1-845" aria-hidden="true" tabindex="-1"></a><span class="fu">metric_for_best_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-846"><a href="#cb1-846" aria-hidden="true" tabindex="-1"></a><span class="fu">greater_is_better</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-847"><a href="#cb1-847" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-848"><a href="#cb1-848" aria-hidden="true" tabindex="-1"></a><span class="co"># High loss value, indicating the learning has broken down (a good estimate is ~2 times</span></span>
+<span id="cb1-849"><a href="#cb1-849" aria-hidden="true" tabindex="-1"></a><span class="co"># the loss at the start of training)</span></span>
+<span id="cb1-850"><a href="#cb1-850" aria-hidden="true" tabindex="-1"></a><span class="fu">loss_watchdog_threshold</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-851"><a href="#cb1-851" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of high-loss steps in a row before the trainer aborts (default: 3)</span></span>
+<span id="cb1-852"><a href="#cb1-852" aria-hidden="true" tabindex="-1"></a><span class="fu">loss_watchdog_patience</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-853"><a href="#cb1-853" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-854"><a href="#cb1-854" aria-hidden="true" tabindex="-1"></a><span class="co"># Run garbage collection every `gc_steps` steps. -1 will run on epoch end and before</span></span>
+<span id="cb1-855"><a href="#cb1-855" aria-hidden="true" tabindex="-1"></a><span class="co"># evaluations. Default is 0 (disabled).</span></span>
+<span id="cb1-856"><a href="#cb1-856" aria-hidden="true" tabindex="-1"></a><span class="fu">gc_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-857"><a href="#cb1-857" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-858"><a href="#cb1-858" aria-hidden="true" tabindex="-1"></a><span class="co"># Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection.</span></span>
+<span id="cb1-859"><a href="#cb1-859" aria-hidden="true" tabindex="-1"></a><span class="co"># require &gt;=ampere</span></span>
+<span id="cb1-860"><a href="#cb1-860" aria-hidden="true" tabindex="-1"></a><span class="fu">bf16</span><span class="kw">:</span><span class="at"> Literal['auto'] | bool | None = auto</span></span>
+<span id="cb1-861"><a href="#cb1-861" aria-hidden="true" tabindex="-1"></a><span class="co"># Use CUDA fp16</span></span>
+<span id="cb1-862"><a href="#cb1-862" aria-hidden="true" tabindex="-1"></a><span class="fu">fp16</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-863"><a href="#cb1-863" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable FP8 mixed precision training using TorchAO. Best used in combination with</span></span>
+<span id="cb1-864"><a href="#cb1-864" aria-hidden="true" tabindex="-1"></a><span class="co"># torch.compile.</span></span>
+<span id="cb1-865"><a href="#cb1-865" aria-hidden="true" tabindex="-1"></a><span class="fu">fp8</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-866"><a href="#cb1-866" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable FSDP float8 all-gather optimization for FP8 training. Can improve training</span></span>
+<span id="cb1-867"><a href="#cb1-867" aria-hidden="true" tabindex="-1"></a><span class="co"># speed by 10-15% when FSDP is enabled.</span></span>
+<span id="cb1-868"><a href="#cb1-868" aria-hidden="true" tabindex="-1"></a><span class="fu">fp8_enable_fsdp_float8_all_gather</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-869"><a href="#cb1-869" aria-hidden="true" tabindex="-1"></a><span class="co"># No AMP (automatic mixed precision) - require &gt;=ampere</span></span>
+<span id="cb1-870"><a href="#cb1-870" aria-hidden="true" tabindex="-1"></a><span class="fu">bfloat16</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-871"><a href="#cb1-871" aria-hidden="true" tabindex="-1"></a><span class="co"># No AMP (automatic mixed precision)</span></span>
+<span id="cb1-872"><a href="#cb1-872" aria-hidden="true" tabindex="-1"></a><span class="fu">float16</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-873"><a href="#cb1-873" aria-hidden="true" tabindex="-1"></a><span class="co"># bool to use CUDA tf32 or 'auto' for automatic detection - require &gt;=ampere</span></span>
+<span id="cb1-874"><a href="#cb1-874" aria-hidden="true" tabindex="-1"></a><span class="fu">tf32</span><span class="kw">:</span><span class="at"> Literal['auto'] | bool | None = auto</span></span>
+<span id="cb1-875"><a href="#cb1-875" aria-hidden="true" tabindex="-1"></a><span class="fu">float32</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-876"><a href="#cb1-876" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-877"><a href="#cb1-877" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use gradient checkpointing. Available options are: true, false, 'offload',</span></span>
+<span id="cb1-878"><a href="#cb1-878" aria-hidden="true" tabindex="-1"></a><span class="co"># 'offload_disk'.</span></span>
+<span id="cb1-879"><a href="#cb1-879" aria-hidden="true" tabindex="-1"></a><span class="co"># https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing</span></span>
+<span id="cb1-880"><a href="#cb1-880" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_checkpointing</span><span class="kw">:</span><span class="at"> Literal['offload', 'offload_disk'] | bool | None = False</span></span>
+<span id="cb1-881"><a href="#cb1-881" aria-hidden="true" tabindex="-1"></a><span class="co"># Additional kwargs to pass to the trainer for gradient checkpointing</span></span>
+<span id="cb1-882"><a href="#cb1-882" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_checkpointing_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
+<span id="cb1-883"><a href="#cb1-883" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to offload activations. Available options are: true, false, 'legacy', 'disk'.</span></span>
+<span id="cb1-884"><a href="#cb1-884" aria-hidden="true" tabindex="-1"></a><span class="fu">activation_offloading</span><span class="kw">:</span><span class="at"> Literal['legacy', 'disk'] | bool | None = False</span></span>
+<span id="cb1-885"><a href="#cb1-885" aria-hidden="true" tabindex="-1"></a><span class="co"># Offload model layer parameters to CPU during forward, prefetch back during backward.</span></span>
+<span id="cb1-886"><a href="#cb1-886" aria-hidden="true" tabindex="-1"></a><span class="fu">layer_offloading</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-887"><a href="#cb1-887" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-888"><a href="#cb1-888" aria-hidden="true" tabindex="-1"></a><span class="co"># List of regex patterns for parameter names to keep unfrozen. All other parameters will</span></span>
+<span id="cb1-889"><a href="#cb1-889" aria-hidden="true" tabindex="-1"></a><span class="co"># be frozen via requires_grad=False. Note: range-based patterns (e.g.</span></span>
+<span id="cb1-890"><a href="#cb1-890" aria-hidden="true" tabindex="-1"></a><span class="co"># embed_tokens.weight$[:32000]) use gradient zeroing rather than a true freeze, so</span></span>
+<span id="cb1-891"><a href="#cb1-891" aria-hidden="true" tabindex="-1"></a><span class="co"># weight decay will still apply to the frozen portion and optimizer states are allocated</span></span>
+<span id="cb1-892"><a href="#cb1-892" aria-hidden="true" tabindex="-1"></a><span class="co"># for the full parameter.</span></span>
+<span id="cb1-893"><a href="#cb1-893" aria-hidden="true" tabindex="-1"></a><span class="fu">unfrozen_parameters</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-894"><a href="#cb1-894" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-895"><a href="#cb1-895" aria-hidden="true" tabindex="-1"></a><span class="co"># The maximum length of an input to train with, this should typically be less than 2048</span></span>
+<span id="cb1-896"><a href="#cb1-896" aria-hidden="true" tabindex="-1"></a><span class="co"># as most models have a token/context limit of 2048</span></span>
+<span id="cb1-897"><a href="#cb1-897" aria-hidden="true" tabindex="-1"></a><span class="fu">sequence_len</span><span class="kw">:</span><span class="at"> int = 512</span></span>
+<span id="cb1-898"><a href="#cb1-898" aria-hidden="true" tabindex="-1"></a><span class="co"># What to do when a tokenized row exceeds sequence_len. 'drop' removes the row;</span></span>
+<span id="cb1-899"><a href="#cb1-899" aria-hidden="true" tabindex="-1"></a><span class="co"># 'truncate' slices tensors to sequence_len; 'raise' raises a ValueError. Defaults to</span></span>
+<span id="cb1-900"><a href="#cb1-900" aria-hidden="true" tabindex="-1"></a><span class="co"># 'drop' for backward compatibility.</span></span>
+<span id="cb1-901"><a href="#cb1-901" aria-hidden="true" tabindex="-1"></a><span class="fu">excess_length_strategy</span><span class="kw">:</span><span class="at"> Literal['drop', 'truncate', 'raise'] | None</span></span>
+<span id="cb1-902"><a href="#cb1-902" aria-hidden="true" tabindex="-1"></a><span class="co"># The maximum length of an input for evaluation. If not specified, defaults to</span></span>
+<span id="cb1-903"><a href="#cb1-903" aria-hidden="true" tabindex="-1"></a><span class="co"># sequence_len</span></span>
+<span id="cb1-904"><a href="#cb1-904" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_sequence_len</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-905"><a href="#cb1-905" aria-hidden="true" tabindex="-1"></a><span class="fu">min_sample_len</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-906"><a href="#cb1-906" aria-hidden="true" tabindex="-1"></a><span class="co"># maximum prompt length for RL training</span></span>
+<span id="cb1-907"><a href="#cb1-907" aria-hidden="true" tabindex="-1"></a><span class="fu">max_prompt_len</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-908"><a href="#cb1-908" aria-hidden="true" tabindex="-1"></a><span class="co"># Use efficient multi-packing with block diagonal attention and per sequence</span></span>
+<span id="cb1-909"><a href="#cb1-909" aria-hidden="true" tabindex="-1"></a><span class="co"># position_ids. Recommend set to 'true'</span></span>
+<span id="cb1-910"><a href="#cb1-910" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-911"><a href="#cb1-911" aria-hidden="true" tabindex="-1"></a><span class="co"># The number of samples packed at a time. Increasing the following values helps with</span></span>
+<span id="cb1-912"><a href="#cb1-912" aria-hidden="true" tabindex="-1"></a><span class="co"># packing, but usually only slightly (&lt;%1.)</span></span>
+<span id="cb1-913"><a href="#cb1-913" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing_group_size</span><span class="kw">:</span><span class="at"> int | None = 100000</span></span>
+<span id="cb1-914"><a href="#cb1-914" aria-hidden="true" tabindex="-1"></a><span class="co"># The number of samples which can be packed into one sequence. Increase if using a large</span></span>
+<span id="cb1-915"><a href="#cb1-915" aria-hidden="true" tabindex="-1"></a><span class="co"># sequence_len with many short samples.</span></span>
+<span id="cb1-916"><a href="#cb1-916" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing_bin_size</span><span class="kw">:</span><span class="at"> int | None = 200</span></span>
+<span id="cb1-917"><a href="#cb1-917" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to pack samples sequentially</span></span>
+<span id="cb1-918"><a href="#cb1-918" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing_sequentially</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-919"><a href="#cb1-919" aria-hidden="true" tabindex="-1"></a><span class="co"># The multiprocessing start method to use for packing. Should be 'fork', 'spawn' or</span></span>
+<span id="cb1-920"><a href="#cb1-920" aria-hidden="true" tabindex="-1"></a><span class="co"># 'forkserver'</span></span>
+<span id="cb1-921"><a href="#cb1-921" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing_mp_start_method</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-922"><a href="#cb1-922" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to 'false' if getting errors during eval with sample_packing on</span></span>
+<span id="cb1-923"><a href="#cb1-923" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_sample_packing</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-924"><a href="#cb1-924" aria-hidden="true" tabindex="-1"></a><span class="co"># Pad inputs so each step uses constant sized buffers. This will reduce memory</span></span>
+<span id="cb1-925"><a href="#cb1-925" aria-hidden="true" tabindex="-1"></a><span class="co"># fragmentation and may prevent OOMs, by re-using memory more efficiently. Defaults to</span></span>
+<span id="cb1-926"><a href="#cb1-926" aria-hidden="true" tabindex="-1"></a><span class="co"># True if `sample_packing` enabled</span></span>
+<span id="cb1-927"><a href="#cb1-927" aria-hidden="true" tabindex="-1"></a><span class="fu">pad_to_sequence_len</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-928"><a href="#cb1-928" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use sequential sampling for curriculum learning</span></span>
+<span id="cb1-929"><a href="#cb1-929" aria-hidden="true" tabindex="-1"></a><span class="fu">curriculum_sampling</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-930"><a href="#cb1-930" aria-hidden="true" tabindex="-1"></a><span class="fu">multipack_real_batches</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-931"><a href="#cb1-931" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-932"><a href="#cb1-932" aria-hidden="true" tabindex="-1"></a><span class="co"># Use batch flattening for speedups when not using sample_packing</span></span>
+<span id="cb1-933"><a href="#cb1-933" aria-hidden="true" tabindex="-1"></a><span class="fu">batch_flattening</span><span class="kw">:</span><span class="at"> Literal['auto'] | bool | None</span></span>
+<span id="cb1-934"><a href="#cb1-934" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-935"><a href="#cb1-935" aria-hidden="true" tabindex="-1"></a><span class="fu">use_pose</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-936"><a href="#cb1-936" aria-hidden="true" tabindex="-1"></a><span class="fu">pose_split_on_token_ids</span><span class="kw">:</span><span class="at"> list[int] | None</span></span>
+<span id="cb1-937"><a href="#cb1-937" aria-hidden="true" tabindex="-1"></a><span class="fu">pose_max_context_len</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-938"><a href="#cb1-938" aria-hidden="true" tabindex="-1"></a><span class="fu">pose_num_chunks</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-939"><a href="#cb1-939" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-940"><a href="#cb1-940" aria-hidden="true" tabindex="-1"></a><span class="fu">pretrain_multipack_buffer_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-941"><a href="#cb1-941" aria-hidden="true" tabindex="-1"></a><span class="co"># whether to prevent cross attention for packed sequences during pretraining</span></span>
+<span id="cb1-942"><a href="#cb1-942" aria-hidden="true" tabindex="-1"></a><span class="fu">pretrain_multipack_attn</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
+<span id="cb1-943"><a href="#cb1-943" aria-hidden="true" tabindex="-1"></a><span class="co"># whether to concatenate samples during pretraining</span></span>
+<span id="cb1-944"><a href="#cb1-944" aria-hidden="true" tabindex="-1"></a><span class="fu">pretraining_sample_concatenation</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-945"><a href="#cb1-945" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-946"><a href="#cb1-946" aria-hidden="true" tabindex="-1"></a><span class="co"># Use streaming mode for loading datasets</span></span>
+<span id="cb1-947"><a href="#cb1-947" aria-hidden="true" tabindex="-1"></a><span class="fu">streaming</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-948"><a href="#cb1-948" aria-hidden="true" tabindex="-1"></a><span class="co"># Buffer size for multipack streaming datasets</span></span>
+<span id="cb1-949"><a href="#cb1-949" aria-hidden="true" tabindex="-1"></a><span class="fu">streaming_multipack_buffer_size</span><span class="kw">:</span><span class="at"> int | None = 10000</span></span>
+<span id="cb1-950"><a href="#cb1-950" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-951"><a href="#cb1-951" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use xformers attention patch https://github.com/facebookresearch/xformers</span></span>
+<span id="cb1-952"><a href="#cb1-952" aria-hidden="true" tabindex="-1"></a><span class="fu">xformers_attention</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-953"><a href="#cb1-953" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/</span></span>
+<span id="cb1-954"><a href="#cb1-954" aria-hidden="true" tabindex="-1"></a><span class="co"># torch.nn.functional.scaled_dot_product_attention.html</span></span>
+<span id="cb1-955"><a href="#cb1-955" aria-hidden="true" tabindex="-1"></a><span class="fu">sdp_attention</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-956"><a href="#cb1-956" aria-hidden="true" tabindex="-1"></a><span class="co"># Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf</span></span>
+<span id="cb1-957"><a href="#cb1-957" aria-hidden="true" tabindex="-1"></a><span class="fu">s2_attention</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-958"><a href="#cb1-958" aria-hidden="true" tabindex="-1"></a><span class="fu">flex_attention</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-959"><a href="#cb1-959" aria-hidden="true" tabindex="-1"></a><span class="fu">flex_attn_compile_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
+<span id="cb1-960"><a href="#cb1-960" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention</span></span>
+<span id="cb1-961"><a href="#cb1-961" aria-hidden="true" tabindex="-1"></a><span class="fu">flash_attention</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-962"><a href="#cb1-962" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use flash-attention cross entropy implementation - advanced use only</span></span>
+<span id="cb1-963"><a href="#cb1-963" aria-hidden="true" tabindex="-1"></a><span class="fu">flash_attn_cross_entropy</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-964"><a href="#cb1-964" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use flash-attention rms norm implementation - advanced use only</span></span>
+<span id="cb1-965"><a href="#cb1-965" aria-hidden="true" tabindex="-1"></a><span class="fu">flash_attn_rms_norm</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-966"><a href="#cb1-966" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to fuse part of the MLP into a single operation</span></span>
+<span id="cb1-967"><a href="#cb1-967" aria-hidden="true" tabindex="-1"></a><span class="fu">flash_attn_fuse_mlp</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-968"><a href="#cb1-968" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use bettertransformers</span></span>
+<span id="cb1-969"><a href="#cb1-969" aria-hidden="true" tabindex="-1"></a><span class="fu">flash_optimum</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-970"><a href="#cb1-970" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use SageAttention https://github.com/thu-ml/SageAttention</span></span>
+<span id="cb1-971"><a href="#cb1-971" aria-hidden="true" tabindex="-1"></a><span class="fu">sage_attention</span><span class="kw">:</span><span class="at"> bool | None</span></span>
 <span id="cb1-972"><a href="#cb1-972" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-973"><a href="#cb1-973" aria-hidden="true" tabindex="-1"></a><span class="co"># Specify a custom attention implementation, used mostly for kernels.</span></span>
-<span id="cb1-974"><a href="#cb1-974" aria-hidden="true" tabindex="-1"></a><span class="fu">attn_implementation</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-975"><a href="#cb1-975" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-976"><a href="#cb1-976" aria-hidden="true" tabindex="-1"></a><span class="co"># Which experts implementation to use for MoE models,</span></span>
-<span id="cb1-977"><a href="#cb1-977" aria-hidden="true" tabindex="-1"></a><span class="fu">experts_implementation</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-978"><a href="#cb1-978" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-979"><a href="#cb1-979" aria-hidden="true" tabindex="-1"></a><span class="co"># Quantize MoE expert weights on load to reduce VRAM. Requires adapter (lora/qlora) with</span></span>
-<span id="cb1-980"><a href="#cb1-980" aria-hidden="true" tabindex="-1"></a><span class="co"># load_in_4bit or load_in_8bit. Requires CUDA (not compatible with ROCm or other</span></span>
-<span id="cb1-981"><a href="#cb1-981" aria-hidden="true" tabindex="-1"></a><span class="co"># backends). Note: total parameter count may be reported incorrectly when enabled</span></span>
-<span id="cb1-982"><a href="#cb1-982" aria-hidden="true" tabindex="-1"></a><span class="co"># (trainable param count is correct).</span></span>
-<span id="cb1-983"><a href="#cb1-983" aria-hidden="true" tabindex="-1"></a><span class="fu">quantize_moe_experts</span><span class="kw">:</span><span class="at"> bool = False</span></span>
-<span id="cb1-984"><a href="#cb1-984" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-985"><a href="#cb1-985" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use Scaled Softmax (SSMax) attention. Ref: https://arxiv.org/abs/2501.19399</span></span>
-<span id="cb1-986"><a href="#cb1-986" aria-hidden="true" tabindex="-1"></a><span class="fu">scaling_softmax</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-987"><a href="#cb1-987" aria-hidden="true" tabindex="-1"></a><span class="co"># Scaling factor for SSMax attention. Default is 0.43</span></span>
-<span id="cb1-988"><a href="#cb1-988" aria-hidden="true" tabindex="-1"></a><span class="fu">scaling_softmax_factor</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-989"><a href="#cb1-989" aria-hidden="true" tabindex="-1"></a><span class="co"># Bias for SSMax attention. Default is 0.0. Note: The paper recommends bias=0 for better</span></span>
-<span id="cb1-990"><a href="#cb1-990" aria-hidden="true" tabindex="-1"></a><span class="co"># length generalization.</span></span>
-<span id="cb1-991"><a href="#cb1-991" aria-hidden="true" tabindex="-1"></a><span class="fu">scaling_softmax_bias</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-992"><a href="#cb1-992" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-993"><a href="#cb1-993" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_cross_entropy_loss</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-994"><a href="#cb1-994" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_lora_mlp</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-995"><a href="#cb1-995" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_lora_qkv</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-996"><a href="#cb1-996" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_lora_o</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-997"><a href="#cb1-997" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_rms_norm</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-998"><a href="#cb1-998" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_rope</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-999"><a href="#cb1-999" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1000"><a href="#cb1-1000" aria-hidden="true" tabindex="-1"></a><span class="co"># Apply custom LoRA autograd functions and activation function Triton kernels for speed</span></span>
-<span id="cb1-1001"><a href="#cb1-1001" aria-hidden="true" tabindex="-1"></a><span class="co"># and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html</span></span>
-<span id="cb1-1002"><a href="#cb1-1002" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_mlp_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1003"><a href="#cb1-1003" aria-hidden="true" tabindex="-1"></a><span class="co"># Apply custom LoRA autograd functions and activation function Triton kernels for speed</span></span>
-<span id="cb1-1004"><a href="#cb1-1004" aria-hidden="true" tabindex="-1"></a><span class="co"># and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html</span></span>
-<span id="cb1-1005"><a href="#cb1-1005" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_qkv_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1006"><a href="#cb1-1006" aria-hidden="true" tabindex="-1"></a><span class="co"># Apply custom LoRA autograd functions and activation function Triton kernels for speed</span></span>
-<span id="cb1-1007"><a href="#cb1-1007" aria-hidden="true" tabindex="-1"></a><span class="co"># and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html</span></span>
-<span id="cb1-1008"><a href="#cb1-1008" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_o_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1009"><a href="#cb1-1009" aria-hidden="true" tabindex="-1"></a><span class="co"># Apply custom LoRA autograd function for embedding layers. See:</span></span>
-<span id="cb1-1010"><a href="#cb1-1010" aria-hidden="true" tabindex="-1"></a><span class="co"># https://docs.axolotl.ai/docs/lora_optims.html</span></span>
-<span id="cb1-1011"><a href="#cb1-1011" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_embedding_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1012"><a href="#cb1-1012" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1013"><a href="#cb1-1013" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use chunked cross entropy loss for memory efficiency</span></span>
-<span id="cb1-1014"><a href="#cb1-1014" aria-hidden="true" tabindex="-1"></a><span class="fu">chunked_cross_entropy</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1015"><a href="#cb1-1015" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of chunks to use for chunked cross entropy loss</span></span>
-<span id="cb1-1016"><a href="#cb1-1016" aria-hidden="true" tabindex="-1"></a><span class="fu">chunked_cross_entropy_num_chunks</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1017"><a href="#cb1-1017" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable Entropy-Aware Focal Training loss (EAFT)</span></span>
-<span id="cb1-1018"><a href="#cb1-1018" aria-hidden="true" tabindex="-1"></a><span class="fu">use_eaft</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1019"><a href="#cb1-1019" aria-hidden="true" tabindex="-1"></a><span class="co"># Exponent for entropy weighting in EAFT (default: 1.0)</span></span>
-<span id="cb1-1020"><a href="#cb1-1020" aria-hidden="true" tabindex="-1"></a><span class="fu">eaft_alpha</span><span class="kw">:</span><span class="at"> float | None = 1.0</span></span>
-<span id="cb1-1021"><a href="#cb1-1021" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of top logits for entropy approximation (default: 20)</span></span>
-<span id="cb1-1022"><a href="#cb1-1022" aria-hidden="true" tabindex="-1"></a><span class="fu">eaft_k</span><span class="kw">:</span><span class="at"> int | None = 20</span></span>
-<span id="cb1-1023"><a href="#cb1-1023" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1024"><a href="#cb1-1024" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use ALST tiled mlp for memory efficient long context</span></span>
-<span id="cb1-1025"><a href="#cb1-1025" aria-hidden="true" tabindex="-1"></a><span class="fu">tiled_mlp</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1026"><a href="#cb1-1026" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1027"><a href="#cb1-1027" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of shards to use for ALST tiled mlp. If unset, it will be set based on</span></span>
-<span id="cb1-1028"><a href="#cb1-1028" aria-hidden="true" tabindex="-1"></a><span class="co"># seqlen/hidden_size</span></span>
-<span id="cb1-1029"><a href="#cb1-1029" aria-hidden="true" tabindex="-1"></a><span class="fu">tiled_mlp_num_shards</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1030"><a href="#cb1-1030" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1031"><a href="#cb1-1031" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on</span></span>
-<span id="cb1-1032"><a href="#cb1-1032" aria-hidden="true" tabindex="-1"></a><span class="co"># llama.</span></span>
-<span id="cb1-1033"><a href="#cb1-1033" aria-hidden="true" tabindex="-1"></a><span class="fu">tiled_mlp_use_original_mlp</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
-<span id="cb1-1034"><a href="#cb1-1034" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1035"><a href="#cb1-1035" aria-hidden="true" tabindex="-1"></a><span class="fu">llama4_linearized_experts</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-973"><a href="#cb1-973" aria-hidden="true" tabindex="-1"></a><span class="fu">eager_attention</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-974"><a href="#cb1-974" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-975"><a href="#cb1-975" aria-hidden="true" tabindex="-1"></a><span class="co"># Specify a custom attention implementation, used mostly for kernels.</span></span>
+<span id="cb1-976"><a href="#cb1-976" aria-hidden="true" tabindex="-1"></a><span class="fu">attn_implementation</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-977"><a href="#cb1-977" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-978"><a href="#cb1-978" aria-hidden="true" tabindex="-1"></a><span class="co"># Which experts implementation to use for MoE models,</span></span>
+<span id="cb1-979"><a href="#cb1-979" aria-hidden="true" tabindex="-1"></a><span class="fu">experts_implementation</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-980"><a href="#cb1-980" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-981"><a href="#cb1-981" aria-hidden="true" tabindex="-1"></a><span class="co"># Quantize MoE expert weights on load to reduce VRAM. Requires adapter (lora/qlora) with</span></span>
+<span id="cb1-982"><a href="#cb1-982" aria-hidden="true" tabindex="-1"></a><span class="co"># load_in_4bit or load_in_8bit. Requires CUDA (not compatible with ROCm or other</span></span>
+<span id="cb1-983"><a href="#cb1-983" aria-hidden="true" tabindex="-1"></a><span class="co"># backends). Note: total parameter count may be reported incorrectly when enabled</span></span>
+<span id="cb1-984"><a href="#cb1-984" aria-hidden="true" tabindex="-1"></a><span class="co"># (trainable param count is correct).</span></span>
+<span id="cb1-985"><a href="#cb1-985" aria-hidden="true" tabindex="-1"></a><span class="fu">quantize_moe_experts</span><span class="kw">:</span><span class="at"> bool = False</span></span>
+<span id="cb1-986"><a href="#cb1-986" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-987"><a href="#cb1-987" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use Scaled Softmax (SSMax) attention. Ref: https://arxiv.org/abs/2501.19399</span></span>
+<span id="cb1-988"><a href="#cb1-988" aria-hidden="true" tabindex="-1"></a><span class="fu">scaling_softmax</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-989"><a href="#cb1-989" aria-hidden="true" tabindex="-1"></a><span class="co"># Scaling factor for SSMax attention. Default is 0.43</span></span>
+<span id="cb1-990"><a href="#cb1-990" aria-hidden="true" tabindex="-1"></a><span class="fu">scaling_softmax_factor</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-991"><a href="#cb1-991" aria-hidden="true" tabindex="-1"></a><span class="co"># Bias for SSMax attention. Default is 0.0. Note: The paper recommends bias=0 for better</span></span>
+<span id="cb1-992"><a href="#cb1-992" aria-hidden="true" tabindex="-1"></a><span class="co"># length generalization.</span></span>
+<span id="cb1-993"><a href="#cb1-993" aria-hidden="true" tabindex="-1"></a><span class="fu">scaling_softmax_bias</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-994"><a href="#cb1-994" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-995"><a href="#cb1-995" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_cross_entropy_loss</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-996"><a href="#cb1-996" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_lora_mlp</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-997"><a href="#cb1-997" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_lora_qkv</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-998"><a href="#cb1-998" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_lora_o</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-999"><a href="#cb1-999" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_rms_norm</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1000"><a href="#cb1-1000" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_rope</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1001"><a href="#cb1-1001" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1002"><a href="#cb1-1002" aria-hidden="true" tabindex="-1"></a><span class="co"># Apply custom LoRA autograd functions and activation function Triton kernels for speed</span></span>
+<span id="cb1-1003"><a href="#cb1-1003" aria-hidden="true" tabindex="-1"></a><span class="co"># and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html</span></span>
+<span id="cb1-1004"><a href="#cb1-1004" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_mlp_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1005"><a href="#cb1-1005" aria-hidden="true" tabindex="-1"></a><span class="co"># Apply custom LoRA autograd functions and activation function Triton kernels for speed</span></span>
+<span id="cb1-1006"><a href="#cb1-1006" aria-hidden="true" tabindex="-1"></a><span class="co"># and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html</span></span>
+<span id="cb1-1007"><a href="#cb1-1007" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_qkv_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1008"><a href="#cb1-1008" aria-hidden="true" tabindex="-1"></a><span class="co"># Apply custom LoRA autograd functions and activation function Triton kernels for speed</span></span>
+<span id="cb1-1009"><a href="#cb1-1009" aria-hidden="true" tabindex="-1"></a><span class="co"># and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html</span></span>
+<span id="cb1-1010"><a href="#cb1-1010" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_o_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1011"><a href="#cb1-1011" aria-hidden="true" tabindex="-1"></a><span class="co"># Apply custom LoRA autograd function for embedding layers. See:</span></span>
+<span id="cb1-1012"><a href="#cb1-1012" aria-hidden="true" tabindex="-1"></a><span class="co"># https://docs.axolotl.ai/docs/lora_optims.html</span></span>
+<span id="cb1-1013"><a href="#cb1-1013" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_embedding_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1014"><a href="#cb1-1014" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1015"><a href="#cb1-1015" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use chunked cross entropy loss for memory efficiency</span></span>
+<span id="cb1-1016"><a href="#cb1-1016" aria-hidden="true" tabindex="-1"></a><span class="fu">chunked_cross_entropy</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1017"><a href="#cb1-1017" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of chunks to use for chunked cross entropy loss</span></span>
+<span id="cb1-1018"><a href="#cb1-1018" aria-hidden="true" tabindex="-1"></a><span class="fu">chunked_cross_entropy_num_chunks</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1019"><a href="#cb1-1019" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable Entropy-Aware Focal Training loss (EAFT)</span></span>
+<span id="cb1-1020"><a href="#cb1-1020" aria-hidden="true" tabindex="-1"></a><span class="fu">use_eaft</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1021"><a href="#cb1-1021" aria-hidden="true" tabindex="-1"></a><span class="co"># Exponent for entropy weighting in EAFT (default: 1.0)</span></span>
+<span id="cb1-1022"><a href="#cb1-1022" aria-hidden="true" tabindex="-1"></a><span class="fu">eaft_alpha</span><span class="kw">:</span><span class="at"> float | None = 1.0</span></span>
+<span id="cb1-1023"><a href="#cb1-1023" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of top logits for entropy approximation (default: 20)</span></span>
+<span id="cb1-1024"><a href="#cb1-1024" aria-hidden="true" tabindex="-1"></a><span class="fu">eaft_k</span><span class="kw">:</span><span class="at"> int | None = 20</span></span>
+<span id="cb1-1025"><a href="#cb1-1025" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1026"><a href="#cb1-1026" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use ALST tiled mlp for memory efficient long context</span></span>
+<span id="cb1-1027"><a href="#cb1-1027" aria-hidden="true" tabindex="-1"></a><span class="fu">tiled_mlp</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1028"><a href="#cb1-1028" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1029"><a href="#cb1-1029" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of shards to use for ALST tiled mlp. If unset, it will be set based on</span></span>
+<span id="cb1-1030"><a href="#cb1-1030" aria-hidden="true" tabindex="-1"></a><span class="co"># seqlen/hidden_size</span></span>
+<span id="cb1-1031"><a href="#cb1-1031" aria-hidden="true" tabindex="-1"></a><span class="fu">tiled_mlp_num_shards</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1032"><a href="#cb1-1032" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1033"><a href="#cb1-1033" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on</span></span>
+<span id="cb1-1034"><a href="#cb1-1034" aria-hidden="true" tabindex="-1"></a><span class="co"># llama.</span></span>
+<span id="cb1-1035"><a href="#cb1-1035" aria-hidden="true" tabindex="-1"></a><span class="fu">tiled_mlp_use_original_mlp</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
 <span id="cb1-1036"><a href="#cb1-1036" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1037"><a href="#cb1-1037" aria-hidden="true" tabindex="-1"></a><span class="co"># Deepspeed config path. e.g., deepspeed_configs/zero3.json</span></span>
-<span id="cb1-1038"><a href="#cb1-1038" aria-hidden="true" tabindex="-1"></a><span class="fu">deepspeed</span><span class="kw">:</span><span class="at"> str | dict[str, Any] | None</span></span>
-<span id="cb1-1039"><a href="#cb1-1039" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use deepcompile for faster training with deepspeed</span></span>
-<span id="cb1-1040"><a href="#cb1-1040" aria-hidden="true" tabindex="-1"></a><span class="fu">deepcompile</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1041"><a href="#cb1-1041" aria-hidden="true" tabindex="-1"></a><span class="co"># FSDP configuration</span></span>
-<span id="cb1-1042"><a href="#cb1-1042" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-1043"><a href="#cb1-1043" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1044"><a href="#cb1-1044" aria-hidden="true" tabindex="-1"></a><span class="co"># FSDP configuration options</span></span>
-<span id="cb1-1045"><a href="#cb1-1045" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_config</span><span class="kw">:</span><span class="at"> FSDPConfig | None</span></span>
-<span id="cb1-1046"><a href="#cb1-1046" aria-hidden="true" tabindex="-1"></a><span class="co">  # For FSDPConfig:</span></span>
-<span id="cb1-1047"><a href="#cb1-1047" aria-hidden="true" tabindex="-1"></a><span class="co">  # FSDP version</span></span>
-<span id="cb1-1048"><a href="#cb1-1048" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_version</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1049"><a href="#cb1-1049" aria-hidden="true" tabindex="-1"></a><span class="co">  # Enable activation checkpointing to reduce memory usage during forward passes</span></span>
-<span id="cb1-1050"><a href="#cb1-1050" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">activation_checkpointing</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1051"><a href="#cb1-1051" aria-hidden="true" tabindex="-1"></a><span class="co">  # Offload parameters to CPU to reduce GPU memory usage</span></span>
-<span id="cb1-1052"><a href="#cb1-1052" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">offload_params</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1053"><a href="#cb1-1053" aria-hidden="true" tabindex="-1"></a><span class="co">  # Synchronize module states across all processes</span></span>
-<span id="cb1-1054"><a href="#cb1-1054" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">sync_module_states</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1055"><a href="#cb1-1055" aria-hidden="true" tabindex="-1"></a><span class="co">  # Enable CPU RAM efficient loading to reduce memory usage during model loading</span></span>
-<span id="cb1-1056"><a href="#cb1-1056" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">cpu_ram_efficient_loading</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1057"><a href="#cb1-1057" aria-hidden="true" tabindex="-1"></a><span class="co">  # Disabling this enables swap memory usage for resource-constrained setups when</span></span>
-<span id="cb1-1058"><a href="#cb1-1058" aria-hidden="true" tabindex="-1"></a><span class="co">  # offload_params is enabled.</span></span>
-<span id="cb1-1059"><a href="#cb1-1059" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">cpu_offload_pin_memory</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1060"><a href="#cb1-1060" aria-hidden="true" tabindex="-1"></a><span class="co">  # Use original parameters instead of flattened parameters</span></span>
-<span id="cb1-1061"><a href="#cb1-1061" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">use_orig_params</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1062"><a href="#cb1-1062" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1063"><a href="#cb1-1063" aria-hidden="true" tabindex="-1"></a><span class="co">  # Type of state dict to use for saving/loading checkpoints</span></span>
-<span id="cb1-1064"><a href="#cb1-1064" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">state_dict_type</span><span class="kw">:</span><span class="at"> Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None</span></span>
-<span id="cb1-1065"><a href="#cb1-1065" aria-hidden="true" tabindex="-1"></a><span class="co">  # Final state dict type to use after training completion</span></span>
-<span id="cb1-1066"><a href="#cb1-1066" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">final_state_dict_type</span><span class="kw">:</span><span class="at"> Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None</span></span>
-<span id="cb1-1067"><a href="#cb1-1067" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1068"><a href="#cb1-1068" aria-hidden="true" tabindex="-1"></a><span class="co">  # Policy for automatically wrapping modules with FSDP</span></span>
-<span id="cb1-1069"><a href="#cb1-1069" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">auto_wrap_policy</span><span class="kw">:</span><span class="at"> Literal['TRANSFORMER_BASED_WRAP', 'SIZE_BASED_WRAP'] | None</span></span>
-<span id="cb1-1070"><a href="#cb1-1070" aria-hidden="true" tabindex="-1"></a><span class="co">  # Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')</span></span>
-<span id="cb1-1071"><a href="#cb1-1071" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">transformer_layer_cls_to_wrap</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1072"><a href="#cb1-1072" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1073"><a href="#cb1-1073" aria-hidden="true" tabindex="-1"></a><span class="co">  # Reshard parameters after forward pass to save memory</span></span>
-<span id="cb1-1074"><a href="#cb1-1074" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">reshard_after_forward</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1075"><a href="#cb1-1075" aria-hidden="true" tabindex="-1"></a><span class="co">  # Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')</span></span>
-<span id="cb1-1076"><a href="#cb1-1076" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">mixed_precision_policy</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1077"><a href="#cb1-1077" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1078"><a href="#cb1-1078" aria-hidden="true" tabindex="-1"></a><span class="co"># FSDP version</span></span>
-<span id="cb1-1079"><a href="#cb1-1079" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_version</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1080"><a href="#cb1-1080" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_final_state_dict_type</span><span class="kw">:</span><span class="at"> Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None</span></span>
-<span id="cb1-1081"><a href="#cb1-1081" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1082"><a href="#cb1-1082" aria-hidden="true" tabindex="-1"></a><span class="co"># How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for</span></span>
-<span id="cb1-1083"><a href="#cb1-1083" aria-hidden="true" tabindex="-1"></a><span class="co"># no eval.</span></span>
-<span id="cb1-1084"><a href="#cb1-1084" aria-hidden="true" tabindex="-1"></a><span class="fu">val_set_size</span><span class="kw">:</span><span class="at"> float | None = 0.0</span></span>
-<span id="cb1-1085"><a href="#cb1-1085" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1086"><a href="#cb1-1086" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of devices to shard across. If not set, will use all available devices.</span></span>
-<span id="cb1-1087"><a href="#cb1-1087" aria-hidden="true" tabindex="-1"></a><span class="fu">dp_shard_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1088"><a href="#cb1-1088" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of devices to replicate across.</span></span>
-<span id="cb1-1089"><a href="#cb1-1089" aria-hidden="true" tabindex="-1"></a><span class="fu">dp_replicate_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1090"><a href="#cb1-1090" aria-hidden="true" tabindex="-1"></a><span class="co"># Deprecated: use `context_parallel_size` instead</span></span>
-<span id="cb1-1091"><a href="#cb1-1091" aria-hidden="true" tabindex="-1"></a><span class="fu">sequence_parallel_degree</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1092"><a href="#cb1-1092" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to a divisor of the number of GPUs available to split sequences into chunks of</span></span>
-<span id="cb1-1093"><a href="#cb1-1093" aria-hidden="true" tabindex="-1"></a><span class="co"># equal size. Use in long context training to prevent OOM when sequences cannot fit into</span></span>
-<span id="cb1-1094"><a href="#cb1-1094" aria-hidden="true" tabindex="-1"></a><span class="co"># a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each</span></span>
-<span id="cb1-1095"><a href="#cb1-1095" aria-hidden="true" tabindex="-1"></a><span class="co"># sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized</span></span>
-<span id="cb1-1096"><a href="#cb1-1096" aria-hidden="true" tabindex="-1"></a><span class="co"># subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more</span></span>
-<span id="cb1-1097"><a href="#cb1-1097" aria-hidden="true" tabindex="-1"></a><span class="co"># details.</span></span>
-<span id="cb1-1098"><a href="#cb1-1098" aria-hidden="true" tabindex="-1"></a><span class="fu">context_parallel_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1099"><a href="#cb1-1099" aria-hidden="true" tabindex="-1"></a><span class="co"># Optional; strides across the key dimension. Larger values use more memory but should</span></span>
-<span id="cb1-1100"><a href="#cb1-1100" aria-hidden="true" tabindex="-1"></a><span class="co"># make training faster. Must evenly divide the number of KV heads in your model.</span></span>
-<span id="cb1-1101"><a href="#cb1-1101" aria-hidden="true" tabindex="-1"></a><span class="fu">heads_k_stride</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1102"><a href="#cb1-1102" aria-hidden="true" tabindex="-1"></a><span class="co"># One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to</span></span>
-<span id="cb1-1103"><a href="#cb1-1103" aria-hidden="true" tabindex="-1"></a><span class="co"># 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing</span></span>
-<span id="cb1-1104"><a href="#cb1-1104" aria-hidden="true" tabindex="-1"></a><span class="co"># case.</span></span>
-<span id="cb1-1105"><a href="#cb1-1105" aria-hidden="true" tabindex="-1"></a><span class="fu">ring_attn_func</span><span class="kw">:</span><span class="at"> RingAttnFunc | None</span></span>
-<span id="cb1-1106"><a href="#cb1-1106" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP.</span></span>
-<span id="cb1-1107"><a href="#cb1-1107" aria-hidden="true" tabindex="-1"></a><span class="fu">tensor_parallel_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1108"><a href="#cb1-1108" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1109"><a href="#cb1-1109" aria-hidden="true" tabindex="-1"></a><span class="co"># Add or change special tokens. If you add tokens here, you don't need to add them to</span></span>
-<span id="cb1-1110"><a href="#cb1-1110" aria-hidden="true" tabindex="-1"></a><span class="co"># the `tokens` list.</span></span>
-<span id="cb1-1111"><a href="#cb1-1111" aria-hidden="true" tabindex="-1"></a><span class="fu">special_tokens</span><span class="kw">:</span><span class="at"> SpecialTokensConfig | None</span></span>
-<span id="cb1-1112"><a href="#cb1-1112" aria-hidden="true" tabindex="-1"></a><span class="co">  # For SpecialTokensConfig:</span></span>
-<span id="cb1-1113"><a href="#cb1-1113" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">bos_token</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1114"><a href="#cb1-1114" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">eos_token</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1115"><a href="#cb1-1115" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">pad_token</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1116"><a href="#cb1-1116" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">unk_token</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1117"><a href="#cb1-1117" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">additional_special_tokens</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-1118"><a href="#cb1-1118" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1119"><a href="#cb1-1119" aria-hidden="true" tabindex="-1"></a><span class="co"># Add extra tokens to the tokenizer</span></span>
-<span id="cb1-1120"><a href="#cb1-1120" aria-hidden="true" tabindex="-1"></a><span class="fu">tokens</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-1121"><a href="#cb1-1121" aria-hidden="true" tabindex="-1"></a><span class="co"># Mapping token_id to new_token_string to override reserved added_tokens in the</span></span>
-<span id="cb1-1122"><a href="#cb1-1122" aria-hidden="true" tabindex="-1"></a><span class="co"># tokenizer. Only works for tokens that are not part of the base vocab (aka are</span></span>
-<span id="cb1-1123"><a href="#cb1-1123" aria-hidden="true" tabindex="-1"></a><span class="co"># added_tokens). Can be checked if they exist in tokenizer.json added_tokens.</span></span>
-<span id="cb1-1124"><a href="#cb1-1124" aria-hidden="true" tabindex="-1"></a><span class="fu">added_tokens_overrides</span><span class="kw">:</span><span class="at"> dict[int, str] | None</span></span>
-<span id="cb1-1125"><a href="#cb1-1125" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1126"><a href="#cb1-1126" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use torch.compile and which backend to use. setting to `auto` will enable</span></span>
-<span id="cb1-1127"><a href="#cb1-1127" aria-hidden="true" tabindex="-1"></a><span class="co"># torch compile when torch&gt;=2.6.0</span></span>
-<span id="cb1-1128"><a href="#cb1-1128" aria-hidden="true" tabindex="-1"></a><span class="fu">torch_compile</span><span class="kw">:</span><span class="at"> Literal['auto'] | bool | None</span></span>
-<span id="cb1-1129"><a href="#cb1-1129" aria-hidden="true" tabindex="-1"></a><span class="co"># Backend to use for torch.compile</span></span>
-<span id="cb1-1130"><a href="#cb1-1130" aria-hidden="true" tabindex="-1"></a><span class="fu">torch_compile_backend</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1131"><a href="#cb1-1131" aria-hidden="true" tabindex="-1"></a><span class="fu">torch_compile_mode</span><span class="kw">:</span><span class="at"> Literal['default', 'reduce-overhead', 'max-autotune'] | None</span></span>
-<span id="cb1-1132"><a href="#cb1-1132" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1133"><a href="#cb1-1133" aria-hidden="true" tabindex="-1"></a><span class="co"># Maximum number of iterations to train for. It precedes num_epochs which means that if</span></span>
-<span id="cb1-1134"><a href="#cb1-1134" aria-hidden="true" tabindex="-1"></a><span class="co"># both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps =&gt;</span></span>
-<span id="cb1-1135"><a href="#cb1-1135" aria-hidden="true" tabindex="-1"></a><span class="co"># `num_epochs: 2` and `max_steps: 100` will train for 100 steps</span></span>
-<span id="cb1-1136"><a href="#cb1-1136" aria-hidden="true" tabindex="-1"></a><span class="fu">max_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1137"><a href="#cb1-1137" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of warmup steps. Cannot use with warmup_ratio</span></span>
-<span id="cb1-1138"><a href="#cb1-1138" aria-hidden="true" tabindex="-1"></a><span class="fu">warmup_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1139"><a href="#cb1-1139" aria-hidden="true" tabindex="-1"></a><span class="co"># Warmup ratio. Cannot use with warmup_steps</span></span>
-<span id="cb1-1140"><a href="#cb1-1140" aria-hidden="true" tabindex="-1"></a><span class="fu">warmup_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1141"><a href="#cb1-1141" aria-hidden="true" tabindex="-1"></a><span class="co"># Leave empty to eval at each epoch, integer for every N steps. float for fraction of</span></span>
-<span id="cb1-1142"><a href="#cb1-1142" aria-hidden="true" tabindex="-1"></a><span class="co"># total steps</span></span>
-<span id="cb1-1143"><a href="#cb1-1143" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_steps</span><span class="kw">:</span><span class="at"> int | float | None</span></span>
-<span id="cb1-1144"><a href="#cb1-1144" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of times per epoch to run evals, mutually exclusive with eval_steps</span></span>
-<span id="cb1-1145"><a href="#cb1-1145" aria-hidden="true" tabindex="-1"></a><span class="fu">evals_per_epoch</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1146"><a href="#cb1-1146" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer</span></span>
-<span id="cb1-1147"><a href="#cb1-1147" aria-hidden="true" tabindex="-1"></a><span class="co"># from `eval_steps`</span></span>
-<span id="cb1-1148"><a href="#cb1-1148" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_strategy</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1149"><a href="#cb1-1149" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1150"><a href="#cb1-1150" aria-hidden="true" tabindex="-1"></a><span class="co"># Leave empty to save at each epoch, integer for every N steps. float for fraction of</span></span>
-<span id="cb1-1151"><a href="#cb1-1151" aria-hidden="true" tabindex="-1"></a><span class="co"># total steps</span></span>
-<span id="cb1-1152"><a href="#cb1-1152" aria-hidden="true" tabindex="-1"></a><span class="fu">save_steps</span><span class="kw">:</span><span class="at"> int | float | None</span></span>
-<span id="cb1-1153"><a href="#cb1-1153" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of times per epoch to save a checkpoint, mutually exclusive with save_steps</span></span>
-<span id="cb1-1154"><a href="#cb1-1154" aria-hidden="true" tabindex="-1"></a><span class="fu">saves_per_epoch</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1155"><a href="#cb1-1155" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better</span></span>
-<span id="cb1-1156"><a href="#cb1-1156" aria-hidden="true" tabindex="-1"></a><span class="co"># result is achieved, leave empty to infer from `save_steps`</span></span>
-<span id="cb1-1157"><a href="#cb1-1157" aria-hidden="true" tabindex="-1"></a><span class="fu">save_strategy</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1158"><a href="#cb1-1158" aria-hidden="true" tabindex="-1"></a><span class="co"># Checkpoints saved at a time</span></span>
-<span id="cb1-1159"><a href="#cb1-1159" aria-hidden="true" tabindex="-1"></a><span class="fu">save_total_limit</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1160"><a href="#cb1-1160" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to checkpoint a model after the first step of training. Defaults to False.</span></span>
-<span id="cb1-1161"><a href="#cb1-1161" aria-hidden="true" tabindex="-1"></a><span class="fu">save_first_step</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1162"><a href="#cb1-1162" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1163"><a href="#cb1-1163" aria-hidden="true" tabindex="-1"></a><span class="co"># Logging frequency</span></span>
-<span id="cb1-1164"><a href="#cb1-1164" aria-hidden="true" tabindex="-1"></a><span class="fu">logging_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1165"><a href="#cb1-1165" aria-hidden="true" tabindex="-1"></a><span class="co"># Stop training after this many evaluation losses have increased in a row. https://huggi</span></span>
-<span id="cb1-1166"><a href="#cb1-1166" aria-hidden="true" tabindex="-1"></a><span class="co"># ngface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppin</span></span>
-<span id="cb1-1167"><a href="#cb1-1167" aria-hidden="true" tabindex="-1"></a><span class="co"># gCallback</span></span>
-<span id="cb1-1168"><a href="#cb1-1168" aria-hidden="true" tabindex="-1"></a><span class="fu">early_stopping_patience</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1169"><a href="#cb1-1169" aria-hidden="true" tabindex="-1"></a><span class="fu">load_best_model_at_end</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-1170"><a href="#cb1-1170" aria-hidden="true" tabindex="-1"></a><span class="co"># Save only the model weights, skipping the optimizer. Using this means you can't resume</span></span>
-<span id="cb1-1171"><a href="#cb1-1171" aria-hidden="true" tabindex="-1"></a><span class="co"># from checkpoints.</span></span>
-<span id="cb1-1172"><a href="#cb1-1172" aria-hidden="true" tabindex="-1"></a><span class="fu">save_only_model</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-1173"><a href="#cb1-1173" aria-hidden="true" tabindex="-1"></a><span class="co"># Use tensorboard for logging</span></span>
-<span id="cb1-1174"><a href="#cb1-1174" aria-hidden="true" tabindex="-1"></a><span class="fu">use_tensorboard</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1175"><a href="#cb1-1175" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable the pytorch profiler to capture the first N steps of training to the</span></span>
-<span id="cb1-1176"><a href="#cb1-1176" aria-hidden="true" tabindex="-1"></a><span class="co"># output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more</span></span>
-<span id="cb1-1177"><a href="#cb1-1177" aria-hidden="true" tabindex="-1"></a><span class="co"># information. Snapshots can be visualized @ https://pytorch.org/memory_viz</span></span>
-<span id="cb1-1178"><a href="#cb1-1178" aria-hidden="true" tabindex="-1"></a><span class="fu">profiler_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1179"><a href="#cb1-1179" aria-hidden="true" tabindex="-1"></a><span class="co"># Which step to start the profiler at. Useful for only capturing a few steps mid-run.</span></span>
-<span id="cb1-1180"><a href="#cb1-1180" aria-hidden="true" tabindex="-1"></a><span class="fu">profiler_steps_start</span><span class="kw">:</span><span class="at"> int | None = 0</span></span>
-<span id="cb1-1181"><a href="#cb1-1181" aria-hidden="true" tabindex="-1"></a><span class="co"># bool of whether to report tokens per second at the end of training. This is not</span></span>
-<span id="cb1-1182"><a href="#cb1-1182" aria-hidden="true" tabindex="-1"></a><span class="co"># supported with pre-training datasets.</span></span>
-<span id="cb1-1183"><a href="#cb1-1183" aria-hidden="true" tabindex="-1"></a><span class="fu">include_tokens_per_second</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1184"><a href="#cb1-1184" aria-hidden="true" tabindex="-1"></a><span class="co"># bool of whether to report tokens per second per-gpu during training by measuring</span></span>
-<span id="cb1-1185"><a href="#cb1-1185" aria-hidden="true" tabindex="-1"></a><span class="co"># throughput of non-padding tokens.</span></span>
-<span id="cb1-1186"><a href="#cb1-1186" aria-hidden="true" tabindex="-1"></a><span class="fu">include_tkps</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
-<span id="cb1-1187"><a href="#cb1-1187" aria-hidden="true" tabindex="-1"></a><span class="co"># NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to</span></span>
-<span id="cb1-1188"><a href="#cb1-1188" aria-hidden="true" tabindex="-1"></a><span class="co"># add noise to embeddings. Currently only supported on Llama and Mistral</span></span>
-<span id="cb1-1189"><a href="#cb1-1189" aria-hidden="true" tabindex="-1"></a><span class="fu">neftune_noise_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1190"><a href="#cb1-1190" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1191"><a href="#cb1-1191" aria-hidden="true" tabindex="-1"></a><span class="co"># Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to</span></span>
-<span id="cb1-1192"><a href="#cb1-1192" aria-hidden="true" tabindex="-1"></a><span class="co"># `beta` in `ORPOConfig` due to trl mapping.</span></span>
-<span id="cb1-1193"><a href="#cb1-1193" aria-hidden="true" tabindex="-1"></a><span class="fu">orpo_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1194"><a href="#cb1-1194" aria-hidden="true" tabindex="-1"></a><span class="co"># Target reward margin for the SimPO loss</span></span>
-<span id="cb1-1195"><a href="#cb1-1195" aria-hidden="true" tabindex="-1"></a><span class="fu">simpo_gamma</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1196"><a href="#cb1-1196" aria-hidden="true" tabindex="-1"></a><span class="co"># Weight of the BC regularizer</span></span>
-<span id="cb1-1197"><a href="#cb1-1197" aria-hidden="true" tabindex="-1"></a><span class="fu">cpo_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1198"><a href="#cb1-1198" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1199"><a href="#cb1-1199" aria-hidden="true" tabindex="-1"></a><span class="co"># Factor for desirable loss term in KTO loss</span></span>
-<span id="cb1-1200"><a href="#cb1-1200" aria-hidden="true" tabindex="-1"></a><span class="fu">kto_desirable_weight</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1201"><a href="#cb1-1201" aria-hidden="true" tabindex="-1"></a><span class="co"># Factor for undesirable loss term in KTO loss</span></span>
-<span id="cb1-1202"><a href="#cb1-1202" aria-hidden="true" tabindex="-1"></a><span class="fu">kto_undesirable_weight</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1203"><a href="#cb1-1203" aria-hidden="true" tabindex="-1"></a><span class="co"># The beta parameter for the RL training</span></span>
-<span id="cb1-1204"><a href="#cb1-1204" aria-hidden="true" tabindex="-1"></a><span class="fu">rl_beta</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1205"><a href="#cb1-1205" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1206"><a href="#cb1-1206" aria-hidden="true" tabindex="-1"></a><span class="co"># Defines the max memory usage per gpu on the system. Passed through to transformers</span></span>
-<span id="cb1-1207"><a href="#cb1-1207" aria-hidden="true" tabindex="-1"></a><span class="co"># when loading the model.</span></span>
-<span id="cb1-1208"><a href="#cb1-1208" aria-hidden="true" tabindex="-1"></a><span class="fu">max_memory</span><span class="kw">:</span><span class="at"> dict[int | Literal['cpu', 'disk'], int | str] | None</span></span>
-<span id="cb1-1209"><a href="#cb1-1209" aria-hidden="true" tabindex="-1"></a><span class="co"># Limit the memory for all available GPUs to this amount (if an integer, expressed in</span></span>
-<span id="cb1-1210"><a href="#cb1-1210" aria-hidden="true" tabindex="-1"></a><span class="co"># gigabytes); default: unset</span></span>
-<span id="cb1-1211"><a href="#cb1-1211" aria-hidden="true" tabindex="-1"></a><span class="fu">gpu_memory_limit</span><span class="kw">:</span><span class="at"> int | str | None</span></span>
-<span id="cb1-1212"><a href="#cb1-1212" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use low_cpu_mem_usage</span></span>
-<span id="cb1-1213"><a href="#cb1-1213" aria-hidden="true" tabindex="-1"></a><span class="fu">low_cpu_mem_usage</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1214"><a href="#cb1-1214" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1215"><a href="#cb1-1215" aria-hidden="true" tabindex="-1"></a><span class="co"># The name of the chat template to use for training, following values are supported:</span></span>
-<span id="cb1-1216"><a href="#cb1-1216" aria-hidden="true" tabindex="-1"></a><span class="co"># tokenizer_default: Uses the chat template that is available in the</span></span>
-<span id="cb1-1217"><a href="#cb1-1217" aria-hidden="true" tabindex="-1"></a><span class="co"># tokenizer_config.json. If the chat template is not available in the tokenizer, it will</span></span>
-<span id="cb1-1218"><a href="#cb1-1218" aria-hidden="true" tabindex="-1"></a><span class="co"># raise an error. This is the default value.</span></span>
-<span id="cb1-1219"><a href="#cb1-1219" aria-hidden="true" tabindex="-1"></a><span class="co"># alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates</span></span>
-<span id="cb1-1220"><a href="#cb1-1220" aria-hidden="true" tabindex="-1"></a><span class="co"># are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.</span></span>
-<span id="cb1-1221"><a href="#cb1-1221" aria-hidden="true" tabindex="-1"></a><span class="co"># tokenizer_default_fallback_*: where * is the name of the chat template to fallback to.</span></span>
-<span id="cb1-1222"><a href="#cb1-1222" aria-hidden="true" tabindex="-1"></a><span class="co"># E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not</span></span>
-<span id="cb1-1223"><a href="#cb1-1223" aria-hidden="true" tabindex="-1"></a><span class="co"># available in the tokenizer. jinja: Uses a custom jinja template for the chat template.</span></span>
-<span id="cb1-1224"><a href="#cb1-1224" aria-hidden="true" tabindex="-1"></a><span class="co"># The custom jinja template should be provided in the chat_template_jinja field. The</span></span>
-<span id="cb1-1225"><a href="#cb1-1225" aria-hidden="true" tabindex="-1"></a><span class="co"># selected chat template will be saved to the tokenizer_config.json for easier</span></span>
-<span id="cb1-1226"><a href="#cb1-1226" aria-hidden="true" tabindex="-1"></a><span class="co"># inferencing</span></span>
-<span id="cb1-1227"><a href="#cb1-1227" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template</span><span class="kw">:</span><span class="at"> ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None</span></span>
-<span id="cb1-1228"><a href="#cb1-1228" aria-hidden="true" tabindex="-1"></a><span class="co"># Custom jinja template or path to jinja file for chat template. This will be only used</span></span>
-<span id="cb1-1229"><a href="#cb1-1229" aria-hidden="true" tabindex="-1"></a><span class="co"># if chat_template is set to `jinja` or `null` (in which case chat_template is</span></span>
-<span id="cb1-1230"><a href="#cb1-1230" aria-hidden="true" tabindex="-1"></a><span class="co"># automatically set to `jinja`). Default is null.</span></span>
-<span id="cb1-1231"><a href="#cb1-1231" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template_jinja</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1232"><a href="#cb1-1232" aria-hidden="true" tabindex="-1"></a><span class="co"># Additional kwargs to pass to the chat template. This is useful for customizing the</span></span>
-<span id="cb1-1233"><a href="#cb1-1233" aria-hidden="true" tabindex="-1"></a><span class="co"># chat template. For example, you can pass `thinking=False` to add a generation prompt</span></span>
-<span id="cb1-1234"><a href="#cb1-1234" aria-hidden="true" tabindex="-1"></a><span class="co"># to the chat template.</span></span>
-<span id="cb1-1235"><a href="#cb1-1235" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
-<span id="cb1-1236"><a href="#cb1-1236" aria-hidden="true" tabindex="-1"></a><span class="co"># Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the</span></span>
-<span id="cb1-1237"><a href="#cb1-1237" aria-hidden="true" tabindex="-1"></a><span class="co"># boundaries between conversation turns. For example: ['/INST', '&lt;/s&gt;',</span></span>
-<span id="cb1-1238"><a href="#cb1-1238" aria-hidden="true" tabindex="-1"></a><span class="co"># '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is</span></span>
-<span id="cb1-1239"><a href="#cb1-1239" aria-hidden="true" tabindex="-1"></a><span class="co"># useful for templates that use multiple delimiter tokens.</span></span>
-<span id="cb1-1240"><a href="#cb1-1240" aria-hidden="true" tabindex="-1"></a><span class="fu">eot_tokens</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-1241"><a href="#cb1-1241" aria-hidden="true" tabindex="-1"></a><span class="co"># Changes the default system message. Currently only supports chatml.</span></span>
-<span id="cb1-1242"><a href="#cb1-1242" aria-hidden="true" tabindex="-1"></a><span class="fu">default_system_message</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1243"><a href="#cb1-1243" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1244"><a href="#cb1-1244" aria-hidden="true" tabindex="-1"></a><span class="co"># Token index or indices to adjust embedding weights to the mean of the other tokens.</span></span>
-<span id="cb1-1245"><a href="#cb1-1245" aria-hidden="true" tabindex="-1"></a><span class="co"># This is useful when the model has untrained embeddings.</span></span>
-<span id="cb1-1246"><a href="#cb1-1246" aria-hidden="true" tabindex="-1"></a><span class="fu">fix_untrained_tokens</span><span class="kw">:</span><span class="at"> int | list[int] | None</span></span>
-<span id="cb1-1247"><a href="#cb1-1247" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1248"><a href="#cb1-1248" aria-hidden="true" tabindex="-1"></a><span class="fu">is_preprocess</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1249"><a href="#cb1-1249" aria-hidden="true" tabindex="-1"></a><span class="fu">preprocess_iterable</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1250"><a href="#cb1-1250" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1251"><a href="#cb1-1251" aria-hidden="true" tabindex="-1"></a><span class="co"># Total number of tokens - internal use</span></span>
-<span id="cb1-1252"><a href="#cb1-1252" aria-hidden="true" tabindex="-1"></a><span class="fu">total_num_tokens</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1253"><a href="#cb1-1253" aria-hidden="true" tabindex="-1"></a><span class="fu">total_supervised_tokens</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1254"><a href="#cb1-1254" aria-hidden="true" tabindex="-1"></a><span class="co"># You can set these packing optimizations AFTER starting a training at least once. The</span></span>
-<span id="cb1-1255"><a href="#cb1-1255" aria-hidden="true" tabindex="-1"></a><span class="co"># trainer will provide recommended values for these values.</span></span>
-<span id="cb1-1256"><a href="#cb1-1256" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing_eff_est</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1257"><a href="#cb1-1257" aria-hidden="true" tabindex="-1"></a><span class="fu">axolotl_config_path</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1258"><a href="#cb1-1258" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1259"><a href="#cb1-1259" aria-hidden="true" tabindex="-1"></a><span class="co"># Internal use only - Used to identify which the model is based on</span></span>
-<span id="cb1-1260"><a href="#cb1-1260" aria-hidden="true" tabindex="-1"></a><span class="fu">is_falcon_derived_model</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1037"><a href="#cb1-1037" aria-hidden="true" tabindex="-1"></a><span class="fu">llama4_linearized_experts</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1038"><a href="#cb1-1038" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1039"><a href="#cb1-1039" aria-hidden="true" tabindex="-1"></a><span class="co"># Deepspeed config path. e.g., deepspeed_configs/zero3.json</span></span>
+<span id="cb1-1040"><a href="#cb1-1040" aria-hidden="true" tabindex="-1"></a><span class="fu">deepspeed</span><span class="kw">:</span><span class="at"> str | dict[str, Any] | None</span></span>
+<span id="cb1-1041"><a href="#cb1-1041" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use deepcompile for faster training with deepspeed</span></span>
+<span id="cb1-1042"><a href="#cb1-1042" aria-hidden="true" tabindex="-1"></a><span class="fu">deepcompile</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1043"><a href="#cb1-1043" aria-hidden="true" tabindex="-1"></a><span class="co"># FSDP configuration</span></span>
+<span id="cb1-1044"><a href="#cb1-1044" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-1045"><a href="#cb1-1045" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1046"><a href="#cb1-1046" aria-hidden="true" tabindex="-1"></a><span class="co"># FSDP configuration options</span></span>
+<span id="cb1-1047"><a href="#cb1-1047" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_config</span><span class="kw">:</span><span class="at"> FSDPConfig | None</span></span>
+<span id="cb1-1048"><a href="#cb1-1048" aria-hidden="true" tabindex="-1"></a><span class="co">  # For FSDPConfig:</span></span>
+<span id="cb1-1049"><a href="#cb1-1049" aria-hidden="true" tabindex="-1"></a><span class="co">  # FSDP version</span></span>
+<span id="cb1-1050"><a href="#cb1-1050" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_version</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1051"><a href="#cb1-1051" aria-hidden="true" tabindex="-1"></a><span class="co">  # Enable activation checkpointing to reduce memory usage during forward passes</span></span>
+<span id="cb1-1052"><a href="#cb1-1052" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">activation_checkpointing</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1053"><a href="#cb1-1053" aria-hidden="true" tabindex="-1"></a><span class="co">  # Offload parameters to CPU to reduce GPU memory usage</span></span>
+<span id="cb1-1054"><a href="#cb1-1054" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">offload_params</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1055"><a href="#cb1-1055" aria-hidden="true" tabindex="-1"></a><span class="co">  # Synchronize module states across all processes</span></span>
+<span id="cb1-1056"><a href="#cb1-1056" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">sync_module_states</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1057"><a href="#cb1-1057" aria-hidden="true" tabindex="-1"></a><span class="co">  # Enable CPU RAM efficient loading to reduce memory usage during model loading</span></span>
+<span id="cb1-1058"><a href="#cb1-1058" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">cpu_ram_efficient_loading</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1059"><a href="#cb1-1059" aria-hidden="true" tabindex="-1"></a><span class="co">  # Disabling this enables swap memory usage for resource-constrained setups when</span></span>
+<span id="cb1-1060"><a href="#cb1-1060" aria-hidden="true" tabindex="-1"></a><span class="co">  # offload_params is enabled.</span></span>
+<span id="cb1-1061"><a href="#cb1-1061" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">cpu_offload_pin_memory</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1062"><a href="#cb1-1062" aria-hidden="true" tabindex="-1"></a><span class="co">  # Use original parameters instead of flattened parameters</span></span>
+<span id="cb1-1063"><a href="#cb1-1063" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">use_orig_params</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1064"><a href="#cb1-1064" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1065"><a href="#cb1-1065" aria-hidden="true" tabindex="-1"></a><span class="co">  # Type of state dict to use for saving/loading checkpoints</span></span>
+<span id="cb1-1066"><a href="#cb1-1066" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">state_dict_type</span><span class="kw">:</span><span class="at"> Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None</span></span>
+<span id="cb1-1067"><a href="#cb1-1067" aria-hidden="true" tabindex="-1"></a><span class="co">  # Final state dict type to use after training completion</span></span>
+<span id="cb1-1068"><a href="#cb1-1068" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">final_state_dict_type</span><span class="kw">:</span><span class="at"> Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None</span></span>
+<span id="cb1-1069"><a href="#cb1-1069" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1070"><a href="#cb1-1070" aria-hidden="true" tabindex="-1"></a><span class="co">  # Policy for automatically wrapping modules with FSDP</span></span>
+<span id="cb1-1071"><a href="#cb1-1071" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">auto_wrap_policy</span><span class="kw">:</span><span class="at"> Literal['TRANSFORMER_BASED_WRAP', 'SIZE_BASED_WRAP'] | None</span></span>
+<span id="cb1-1072"><a href="#cb1-1072" aria-hidden="true" tabindex="-1"></a><span class="co">  # Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')</span></span>
+<span id="cb1-1073"><a href="#cb1-1073" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">transformer_layer_cls_to_wrap</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1074"><a href="#cb1-1074" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1075"><a href="#cb1-1075" aria-hidden="true" tabindex="-1"></a><span class="co">  # Reshard parameters after forward pass to save memory</span></span>
+<span id="cb1-1076"><a href="#cb1-1076" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">reshard_after_forward</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1077"><a href="#cb1-1077" aria-hidden="true" tabindex="-1"></a><span class="co">  # Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')</span></span>
+<span id="cb1-1078"><a href="#cb1-1078" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">mixed_precision_policy</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1079"><a href="#cb1-1079" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1080"><a href="#cb1-1080" aria-hidden="true" tabindex="-1"></a><span class="co"># FSDP version</span></span>
+<span id="cb1-1081"><a href="#cb1-1081" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_version</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1082"><a href="#cb1-1082" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_final_state_dict_type</span><span class="kw">:</span><span class="at"> Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None</span></span>
+<span id="cb1-1083"><a href="#cb1-1083" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1084"><a href="#cb1-1084" aria-hidden="true" tabindex="-1"></a><span class="co"># How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for</span></span>
+<span id="cb1-1085"><a href="#cb1-1085" aria-hidden="true" tabindex="-1"></a><span class="co"># no eval.</span></span>
+<span id="cb1-1086"><a href="#cb1-1086" aria-hidden="true" tabindex="-1"></a><span class="fu">val_set_size</span><span class="kw">:</span><span class="at"> float | None = 0.0</span></span>
+<span id="cb1-1087"><a href="#cb1-1087" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1088"><a href="#cb1-1088" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of devices to shard across. If not set, will use all available devices.</span></span>
+<span id="cb1-1089"><a href="#cb1-1089" aria-hidden="true" tabindex="-1"></a><span class="fu">dp_shard_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1090"><a href="#cb1-1090" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of devices to replicate across.</span></span>
+<span id="cb1-1091"><a href="#cb1-1091" aria-hidden="true" tabindex="-1"></a><span class="fu">dp_replicate_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1092"><a href="#cb1-1092" aria-hidden="true" tabindex="-1"></a><span class="co"># Deprecated: use `context_parallel_size` instead</span></span>
+<span id="cb1-1093"><a href="#cb1-1093" aria-hidden="true" tabindex="-1"></a><span class="fu">sequence_parallel_degree</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1094"><a href="#cb1-1094" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to a divisor of the number of GPUs available to split sequences into chunks of</span></span>
+<span id="cb1-1095"><a href="#cb1-1095" aria-hidden="true" tabindex="-1"></a><span class="co"># equal size. Use in long context training to prevent OOM when sequences cannot fit into</span></span>
+<span id="cb1-1096"><a href="#cb1-1096" aria-hidden="true" tabindex="-1"></a><span class="co"># a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each</span></span>
+<span id="cb1-1097"><a href="#cb1-1097" aria-hidden="true" tabindex="-1"></a><span class="co"># sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized</span></span>
+<span id="cb1-1098"><a href="#cb1-1098" aria-hidden="true" tabindex="-1"></a><span class="co"># subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more</span></span>
+<span id="cb1-1099"><a href="#cb1-1099" aria-hidden="true" tabindex="-1"></a><span class="co"># details.</span></span>
+<span id="cb1-1100"><a href="#cb1-1100" aria-hidden="true" tabindex="-1"></a><span class="fu">context_parallel_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1101"><a href="#cb1-1101" aria-hidden="true" tabindex="-1"></a><span class="co"># Optional; strides across the key dimension. Larger values use more memory but should</span></span>
+<span id="cb1-1102"><a href="#cb1-1102" aria-hidden="true" tabindex="-1"></a><span class="co"># make training faster. Must evenly divide the number of KV heads in your model.</span></span>
+<span id="cb1-1103"><a href="#cb1-1103" aria-hidden="true" tabindex="-1"></a><span class="fu">heads_k_stride</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1104"><a href="#cb1-1104" aria-hidden="true" tabindex="-1"></a><span class="co"># One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to</span></span>
+<span id="cb1-1105"><a href="#cb1-1105" aria-hidden="true" tabindex="-1"></a><span class="co"># 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing</span></span>
+<span id="cb1-1106"><a href="#cb1-1106" aria-hidden="true" tabindex="-1"></a><span class="co"># case.</span></span>
+<span id="cb1-1107"><a href="#cb1-1107" aria-hidden="true" tabindex="-1"></a><span class="fu">ring_attn_func</span><span class="kw">:</span><span class="at"> RingAttnFunc | None</span></span>
+<span id="cb1-1108"><a href="#cb1-1108" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP.</span></span>
+<span id="cb1-1109"><a href="#cb1-1109" aria-hidden="true" tabindex="-1"></a><span class="fu">tensor_parallel_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1110"><a href="#cb1-1110" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1111"><a href="#cb1-1111" aria-hidden="true" tabindex="-1"></a><span class="co"># Add or change special tokens. If you add tokens here, you don't need to add them to</span></span>
+<span id="cb1-1112"><a href="#cb1-1112" aria-hidden="true" tabindex="-1"></a><span class="co"># the `tokens` list.</span></span>
+<span id="cb1-1113"><a href="#cb1-1113" aria-hidden="true" tabindex="-1"></a><span class="fu">special_tokens</span><span class="kw">:</span><span class="at"> SpecialTokensConfig | None</span></span>
+<span id="cb1-1114"><a href="#cb1-1114" aria-hidden="true" tabindex="-1"></a><span class="co">  # For SpecialTokensConfig:</span></span>
+<span id="cb1-1115"><a href="#cb1-1115" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">bos_token</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1116"><a href="#cb1-1116" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">eos_token</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1117"><a href="#cb1-1117" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">pad_token</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1118"><a href="#cb1-1118" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">unk_token</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1119"><a href="#cb1-1119" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">additional_special_tokens</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-1120"><a href="#cb1-1120" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1121"><a href="#cb1-1121" aria-hidden="true" tabindex="-1"></a><span class="co"># Add extra tokens to the tokenizer</span></span>
+<span id="cb1-1122"><a href="#cb1-1122" aria-hidden="true" tabindex="-1"></a><span class="fu">tokens</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-1123"><a href="#cb1-1123" aria-hidden="true" tabindex="-1"></a><span class="co"># Mapping token_id to new_token_string to override reserved added_tokens in the</span></span>
+<span id="cb1-1124"><a href="#cb1-1124" aria-hidden="true" tabindex="-1"></a><span class="co"># tokenizer. Only works for tokens that are not part of the base vocab (aka are</span></span>
+<span id="cb1-1125"><a href="#cb1-1125" aria-hidden="true" tabindex="-1"></a><span class="co"># added_tokens). Can be checked if they exist in tokenizer.json added_tokens.</span></span>
+<span id="cb1-1126"><a href="#cb1-1126" aria-hidden="true" tabindex="-1"></a><span class="fu">added_tokens_overrides</span><span class="kw">:</span><span class="at"> dict[int, str] | None</span></span>
+<span id="cb1-1127"><a href="#cb1-1127" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1128"><a href="#cb1-1128" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use torch.compile and which backend to use. setting to `auto` will enable</span></span>
+<span id="cb1-1129"><a href="#cb1-1129" aria-hidden="true" tabindex="-1"></a><span class="co"># torch compile when torch&gt;=2.6.0</span></span>
+<span id="cb1-1130"><a href="#cb1-1130" aria-hidden="true" tabindex="-1"></a><span class="fu">torch_compile</span><span class="kw">:</span><span class="at"> Literal['auto'] | bool | None</span></span>
+<span id="cb1-1131"><a href="#cb1-1131" aria-hidden="true" tabindex="-1"></a><span class="co"># Backend to use for torch.compile</span></span>
+<span id="cb1-1132"><a href="#cb1-1132" aria-hidden="true" tabindex="-1"></a><span class="fu">torch_compile_backend</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1133"><a href="#cb1-1133" aria-hidden="true" tabindex="-1"></a><span class="fu">torch_compile_mode</span><span class="kw">:</span><span class="at"> Literal['default', 'reduce-overhead', 'max-autotune'] | None</span></span>
+<span id="cb1-1134"><a href="#cb1-1134" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1135"><a href="#cb1-1135" aria-hidden="true" tabindex="-1"></a><span class="co"># Maximum number of iterations to train for. It precedes num_epochs which means that if</span></span>
+<span id="cb1-1136"><a href="#cb1-1136" aria-hidden="true" tabindex="-1"></a><span class="co"># both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps =&gt;</span></span>
+<span id="cb1-1137"><a href="#cb1-1137" aria-hidden="true" tabindex="-1"></a><span class="co"># `num_epochs: 2` and `max_steps: 100` will train for 100 steps</span></span>
+<span id="cb1-1138"><a href="#cb1-1138" aria-hidden="true" tabindex="-1"></a><span class="fu">max_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1139"><a href="#cb1-1139" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of warmup steps. Cannot use with warmup_ratio</span></span>
+<span id="cb1-1140"><a href="#cb1-1140" aria-hidden="true" tabindex="-1"></a><span class="fu">warmup_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1141"><a href="#cb1-1141" aria-hidden="true" tabindex="-1"></a><span class="co"># Warmup ratio. Cannot use with warmup_steps</span></span>
+<span id="cb1-1142"><a href="#cb1-1142" aria-hidden="true" tabindex="-1"></a><span class="fu">warmup_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1143"><a href="#cb1-1143" aria-hidden="true" tabindex="-1"></a><span class="co"># Leave empty to eval at each epoch, integer for every N steps. float for fraction of</span></span>
+<span id="cb1-1144"><a href="#cb1-1144" aria-hidden="true" tabindex="-1"></a><span class="co"># total steps</span></span>
+<span id="cb1-1145"><a href="#cb1-1145" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_steps</span><span class="kw">:</span><span class="at"> int | float | None</span></span>
+<span id="cb1-1146"><a href="#cb1-1146" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of times per epoch to run evals, mutually exclusive with eval_steps</span></span>
+<span id="cb1-1147"><a href="#cb1-1147" aria-hidden="true" tabindex="-1"></a><span class="fu">evals_per_epoch</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1148"><a href="#cb1-1148" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer</span></span>
+<span id="cb1-1149"><a href="#cb1-1149" aria-hidden="true" tabindex="-1"></a><span class="co"># from `eval_steps`</span></span>
+<span id="cb1-1150"><a href="#cb1-1150" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_strategy</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1151"><a href="#cb1-1151" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1152"><a href="#cb1-1152" aria-hidden="true" tabindex="-1"></a><span class="co"># Leave empty to save at each epoch, integer for every N steps. float for fraction of</span></span>
+<span id="cb1-1153"><a href="#cb1-1153" aria-hidden="true" tabindex="-1"></a><span class="co"># total steps</span></span>
+<span id="cb1-1154"><a href="#cb1-1154" aria-hidden="true" tabindex="-1"></a><span class="fu">save_steps</span><span class="kw">:</span><span class="at"> int | float | None</span></span>
+<span id="cb1-1155"><a href="#cb1-1155" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of times per epoch to save a checkpoint, mutually exclusive with save_steps</span></span>
+<span id="cb1-1156"><a href="#cb1-1156" aria-hidden="true" tabindex="-1"></a><span class="fu">saves_per_epoch</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1157"><a href="#cb1-1157" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better</span></span>
+<span id="cb1-1158"><a href="#cb1-1158" aria-hidden="true" tabindex="-1"></a><span class="co"># result is achieved, leave empty to infer from `save_steps`</span></span>
+<span id="cb1-1159"><a href="#cb1-1159" aria-hidden="true" tabindex="-1"></a><span class="fu">save_strategy</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1160"><a href="#cb1-1160" aria-hidden="true" tabindex="-1"></a><span class="co"># Checkpoints saved at a time</span></span>
+<span id="cb1-1161"><a href="#cb1-1161" aria-hidden="true" tabindex="-1"></a><span class="fu">save_total_limit</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1162"><a href="#cb1-1162" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to checkpoint a model after the first step of training. Defaults to False.</span></span>
+<span id="cb1-1163"><a href="#cb1-1163" aria-hidden="true" tabindex="-1"></a><span class="fu">save_first_step</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1164"><a href="#cb1-1164" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1165"><a href="#cb1-1165" aria-hidden="true" tabindex="-1"></a><span class="co"># Logging frequency</span></span>
+<span id="cb1-1166"><a href="#cb1-1166" aria-hidden="true" tabindex="-1"></a><span class="fu">logging_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1167"><a href="#cb1-1167" aria-hidden="true" tabindex="-1"></a><span class="co"># Stop training after this many evaluation losses have increased in a row. https://huggi</span></span>
+<span id="cb1-1168"><a href="#cb1-1168" aria-hidden="true" tabindex="-1"></a><span class="co"># ngface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppin</span></span>
+<span id="cb1-1169"><a href="#cb1-1169" aria-hidden="true" tabindex="-1"></a><span class="co"># gCallback</span></span>
+<span id="cb1-1170"><a href="#cb1-1170" aria-hidden="true" tabindex="-1"></a><span class="fu">early_stopping_patience</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1171"><a href="#cb1-1171" aria-hidden="true" tabindex="-1"></a><span class="fu">load_best_model_at_end</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-1172"><a href="#cb1-1172" aria-hidden="true" tabindex="-1"></a><span class="co"># Save only the model weights, skipping the optimizer. Using this means you can't resume</span></span>
+<span id="cb1-1173"><a href="#cb1-1173" aria-hidden="true" tabindex="-1"></a><span class="co"># from checkpoints.</span></span>
+<span id="cb1-1174"><a href="#cb1-1174" aria-hidden="true" tabindex="-1"></a><span class="fu">save_only_model</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-1175"><a href="#cb1-1175" aria-hidden="true" tabindex="-1"></a><span class="co"># Use tensorboard for logging</span></span>
+<span id="cb1-1176"><a href="#cb1-1176" aria-hidden="true" tabindex="-1"></a><span class="fu">use_tensorboard</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1177"><a href="#cb1-1177" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable the pytorch profiler to capture the first N steps of training to the</span></span>
+<span id="cb1-1178"><a href="#cb1-1178" aria-hidden="true" tabindex="-1"></a><span class="co"># output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more</span></span>
+<span id="cb1-1179"><a href="#cb1-1179" aria-hidden="true" tabindex="-1"></a><span class="co"># information. Snapshots can be visualized @ https://pytorch.org/memory_viz</span></span>
+<span id="cb1-1180"><a href="#cb1-1180" aria-hidden="true" tabindex="-1"></a><span class="fu">profiler_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1181"><a href="#cb1-1181" aria-hidden="true" tabindex="-1"></a><span class="co"># Which step to start the profiler at. Useful for only capturing a few steps mid-run.</span></span>
+<span id="cb1-1182"><a href="#cb1-1182" aria-hidden="true" tabindex="-1"></a><span class="fu">profiler_steps_start</span><span class="kw">:</span><span class="at"> int | None = 0</span></span>
+<span id="cb1-1183"><a href="#cb1-1183" aria-hidden="true" tabindex="-1"></a><span class="co"># bool of whether to report tokens per second at the end of training. This is not</span></span>
+<span id="cb1-1184"><a href="#cb1-1184" aria-hidden="true" tabindex="-1"></a><span class="co"># supported with pre-training datasets.</span></span>
+<span id="cb1-1185"><a href="#cb1-1185" aria-hidden="true" tabindex="-1"></a><span class="fu">include_tokens_per_second</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1186"><a href="#cb1-1186" aria-hidden="true" tabindex="-1"></a><span class="co"># bool of whether to report tokens per second per-gpu during training by measuring</span></span>
+<span id="cb1-1187"><a href="#cb1-1187" aria-hidden="true" tabindex="-1"></a><span class="co"># throughput of non-padding tokens.</span></span>
+<span id="cb1-1188"><a href="#cb1-1188" aria-hidden="true" tabindex="-1"></a><span class="fu">include_tkps</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
+<span id="cb1-1189"><a href="#cb1-1189" aria-hidden="true" tabindex="-1"></a><span class="co"># NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to</span></span>
+<span id="cb1-1190"><a href="#cb1-1190" aria-hidden="true" tabindex="-1"></a><span class="co"># add noise to embeddings. Currently only supported on Llama and Mistral</span></span>
+<span id="cb1-1191"><a href="#cb1-1191" aria-hidden="true" tabindex="-1"></a><span class="fu">neftune_noise_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1192"><a href="#cb1-1192" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1193"><a href="#cb1-1193" aria-hidden="true" tabindex="-1"></a><span class="co"># Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to</span></span>
+<span id="cb1-1194"><a href="#cb1-1194" aria-hidden="true" tabindex="-1"></a><span class="co"># `beta` in `ORPOConfig` due to trl mapping.</span></span>
+<span id="cb1-1195"><a href="#cb1-1195" aria-hidden="true" tabindex="-1"></a><span class="fu">orpo_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1196"><a href="#cb1-1196" aria-hidden="true" tabindex="-1"></a><span class="co"># Target reward margin for the SimPO loss</span></span>
+<span id="cb1-1197"><a href="#cb1-1197" aria-hidden="true" tabindex="-1"></a><span class="fu">simpo_gamma</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1198"><a href="#cb1-1198" aria-hidden="true" tabindex="-1"></a><span class="co"># Weight of the BC regularizer</span></span>
+<span id="cb1-1199"><a href="#cb1-1199" aria-hidden="true" tabindex="-1"></a><span class="fu">cpo_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1200"><a href="#cb1-1200" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1201"><a href="#cb1-1201" aria-hidden="true" tabindex="-1"></a><span class="co"># Factor for desirable loss term in KTO loss</span></span>
+<span id="cb1-1202"><a href="#cb1-1202" aria-hidden="true" tabindex="-1"></a><span class="fu">kto_desirable_weight</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1203"><a href="#cb1-1203" aria-hidden="true" tabindex="-1"></a><span class="co"># Factor for undesirable loss term in KTO loss</span></span>
+<span id="cb1-1204"><a href="#cb1-1204" aria-hidden="true" tabindex="-1"></a><span class="fu">kto_undesirable_weight</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1205"><a href="#cb1-1205" aria-hidden="true" tabindex="-1"></a><span class="co"># The beta parameter for the RL training</span></span>
+<span id="cb1-1206"><a href="#cb1-1206" aria-hidden="true" tabindex="-1"></a><span class="fu">rl_beta</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1207"><a href="#cb1-1207" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1208"><a href="#cb1-1208" aria-hidden="true" tabindex="-1"></a><span class="co"># Defines the max memory usage per gpu on the system. Passed through to transformers</span></span>
+<span id="cb1-1209"><a href="#cb1-1209" aria-hidden="true" tabindex="-1"></a><span class="co"># when loading the model.</span></span>
+<span id="cb1-1210"><a href="#cb1-1210" aria-hidden="true" tabindex="-1"></a><span class="fu">max_memory</span><span class="kw">:</span><span class="at"> dict[int | Literal['cpu', 'disk'], int | str] | None</span></span>
+<span id="cb1-1211"><a href="#cb1-1211" aria-hidden="true" tabindex="-1"></a><span class="co"># Limit the memory for all available GPUs to this amount (if an integer, expressed in</span></span>
+<span id="cb1-1212"><a href="#cb1-1212" aria-hidden="true" tabindex="-1"></a><span class="co"># gigabytes); default: unset</span></span>
+<span id="cb1-1213"><a href="#cb1-1213" aria-hidden="true" tabindex="-1"></a><span class="fu">gpu_memory_limit</span><span class="kw">:</span><span class="at"> int | str | None</span></span>
+<span id="cb1-1214"><a href="#cb1-1214" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use low_cpu_mem_usage</span></span>
+<span id="cb1-1215"><a href="#cb1-1215" aria-hidden="true" tabindex="-1"></a><span class="fu">low_cpu_mem_usage</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1216"><a href="#cb1-1216" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1217"><a href="#cb1-1217" aria-hidden="true" tabindex="-1"></a><span class="co"># The name of the chat template to use for training, following values are supported:</span></span>
+<span id="cb1-1218"><a href="#cb1-1218" aria-hidden="true" tabindex="-1"></a><span class="co"># tokenizer_default: Uses the chat template that is available in the</span></span>
+<span id="cb1-1219"><a href="#cb1-1219" aria-hidden="true" tabindex="-1"></a><span class="co"># tokenizer_config.json. If the chat template is not available in the tokenizer, it will</span></span>
+<span id="cb1-1220"><a href="#cb1-1220" aria-hidden="true" tabindex="-1"></a><span class="co"># raise an error. This is the default value.</span></span>
+<span id="cb1-1221"><a href="#cb1-1221" aria-hidden="true" tabindex="-1"></a><span class="co"># alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates</span></span>
+<span id="cb1-1222"><a href="#cb1-1222" aria-hidden="true" tabindex="-1"></a><span class="co"># are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.</span></span>
+<span id="cb1-1223"><a href="#cb1-1223" aria-hidden="true" tabindex="-1"></a><span class="co"># tokenizer_default_fallback_*: where * is the name of the chat template to fallback to.</span></span>
+<span id="cb1-1224"><a href="#cb1-1224" aria-hidden="true" tabindex="-1"></a><span class="co"># E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not</span></span>
+<span id="cb1-1225"><a href="#cb1-1225" aria-hidden="true" tabindex="-1"></a><span class="co"># available in the tokenizer. jinja: Uses a custom jinja template for the chat template.</span></span>
+<span id="cb1-1226"><a href="#cb1-1226" aria-hidden="true" tabindex="-1"></a><span class="co"># The custom jinja template should be provided in the chat_template_jinja field. The</span></span>
+<span id="cb1-1227"><a href="#cb1-1227" aria-hidden="true" tabindex="-1"></a><span class="co"># selected chat template will be saved to the tokenizer_config.json for easier</span></span>
+<span id="cb1-1228"><a href="#cb1-1228" aria-hidden="true" tabindex="-1"></a><span class="co"># inferencing</span></span>
+<span id="cb1-1229"><a href="#cb1-1229" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template</span><span class="kw">:</span><span class="at"> ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None</span></span>
+<span id="cb1-1230"><a href="#cb1-1230" aria-hidden="true" tabindex="-1"></a><span class="co"># Custom jinja template or path to jinja file for chat template. This will be only used</span></span>
+<span id="cb1-1231"><a href="#cb1-1231" aria-hidden="true" tabindex="-1"></a><span class="co"># if chat_template is set to `jinja` or `null` (in which case chat_template is</span></span>
+<span id="cb1-1232"><a href="#cb1-1232" aria-hidden="true" tabindex="-1"></a><span class="co"># automatically set to `jinja`). Default is null.</span></span>
+<span id="cb1-1233"><a href="#cb1-1233" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template_jinja</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1234"><a href="#cb1-1234" aria-hidden="true" tabindex="-1"></a><span class="co"># Additional kwargs to pass to the chat template. This is useful for customizing the</span></span>
+<span id="cb1-1235"><a href="#cb1-1235" aria-hidden="true" tabindex="-1"></a><span class="co"># chat template. For example, you can pass `thinking=False` to add a generation prompt</span></span>
+<span id="cb1-1236"><a href="#cb1-1236" aria-hidden="true" tabindex="-1"></a><span class="co"># to the chat template.</span></span>
+<span id="cb1-1237"><a href="#cb1-1237" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
+<span id="cb1-1238"><a href="#cb1-1238" aria-hidden="true" tabindex="-1"></a><span class="co"># Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the</span></span>
+<span id="cb1-1239"><a href="#cb1-1239" aria-hidden="true" tabindex="-1"></a><span class="co"># boundaries between conversation turns. For example: ['/INST', '&lt;/s&gt;',</span></span>
+<span id="cb1-1240"><a href="#cb1-1240" aria-hidden="true" tabindex="-1"></a><span class="co"># '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is</span></span>
+<span id="cb1-1241"><a href="#cb1-1241" aria-hidden="true" tabindex="-1"></a><span class="co"># useful for templates that use multiple delimiter tokens.</span></span>
+<span id="cb1-1242"><a href="#cb1-1242" aria-hidden="true" tabindex="-1"></a><span class="fu">eot_tokens</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-1243"><a href="#cb1-1243" aria-hidden="true" tabindex="-1"></a><span class="co"># Changes the default system message. Currently only supports chatml.</span></span>
+<span id="cb1-1244"><a href="#cb1-1244" aria-hidden="true" tabindex="-1"></a><span class="fu">default_system_message</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1245"><a href="#cb1-1245" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1246"><a href="#cb1-1246" aria-hidden="true" tabindex="-1"></a><span class="co"># Token index or indices to adjust embedding weights to the mean of the other tokens.</span></span>
+<span id="cb1-1247"><a href="#cb1-1247" aria-hidden="true" tabindex="-1"></a><span class="co"># This is useful when the model has untrained embeddings.</span></span>
+<span id="cb1-1248"><a href="#cb1-1248" aria-hidden="true" tabindex="-1"></a><span class="fu">fix_untrained_tokens</span><span class="kw">:</span><span class="at"> int | list[int] | None</span></span>
+<span id="cb1-1249"><a href="#cb1-1249" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1250"><a href="#cb1-1250" aria-hidden="true" tabindex="-1"></a><span class="fu">is_preprocess</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1251"><a href="#cb1-1251" aria-hidden="true" tabindex="-1"></a><span class="fu">preprocess_iterable</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1252"><a href="#cb1-1252" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1253"><a href="#cb1-1253" aria-hidden="true" tabindex="-1"></a><span class="co"># Total number of tokens - internal use</span></span>
+<span id="cb1-1254"><a href="#cb1-1254" aria-hidden="true" tabindex="-1"></a><span class="fu">total_num_tokens</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1255"><a href="#cb1-1255" aria-hidden="true" tabindex="-1"></a><span class="fu">total_supervised_tokens</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1256"><a href="#cb1-1256" aria-hidden="true" tabindex="-1"></a><span class="co"># You can set these packing optimizations AFTER starting a training at least once. The</span></span>
+<span id="cb1-1257"><a href="#cb1-1257" aria-hidden="true" tabindex="-1"></a><span class="co"># trainer will provide recommended values for these values.</span></span>
+<span id="cb1-1258"><a href="#cb1-1258" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing_eff_est</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1259"><a href="#cb1-1259" aria-hidden="true" tabindex="-1"></a><span class="fu">axolotl_config_path</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1260"><a href="#cb1-1260" aria-hidden="true" tabindex="-1"></a></span>
 <span id="cb1-1261"><a href="#cb1-1261" aria-hidden="true" tabindex="-1"></a><span class="co"># Internal use only - Used to identify which the model is based on</span></span>
-<span id="cb1-1262"><a href="#cb1-1262" aria-hidden="true" tabindex="-1"></a><span class="fu">is_llama_derived_model</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1263"><a href="#cb1-1263" aria-hidden="true" tabindex="-1"></a><span class="co"># Internal use only - Used to identify which the model is based on. Please note that if</span></span>
-<span id="cb1-1264"><a href="#cb1-1264" aria-hidden="true" tabindex="-1"></a><span class="co"># you set this to true, `padding_side` will be set to 'left' by default</span></span>
-<span id="cb1-1265"><a href="#cb1-1265" aria-hidden="true" tabindex="-1"></a><span class="fu">is_mistral_derived_model</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1266"><a href="#cb1-1266" aria-hidden="true" tabindex="-1"></a><span class="co"># Internal use only - Used to identify which the model is based on</span></span>
-<span id="cb1-1267"><a href="#cb1-1267" aria-hidden="true" tabindex="-1"></a><span class="fu">is_qwen_derived_model</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1268"><a href="#cb1-1268" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1269"><a href="#cb1-1269" aria-hidden="true" tabindex="-1"></a><span class="co"># Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available</span></span>
-<span id="cb1-1270"><a href="#cb1-1270" aria-hidden="true" tabindex="-1"></a><span class="co"># plugins or doc below for more details.</span></span>
-<span id="cb1-1271"><a href="#cb1-1271" aria-hidden="true" tabindex="-1"></a><span class="co"># https://docs.axolotl.ai/docs/custom_integrations.html</span></span>
-<span id="cb1-1272"><a href="#cb1-1272" aria-hidden="true" tabindex="-1"></a><span class="fu">plugins</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-1273"><a href="#cb1-1273" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable sample generation during training for monitoring</span></span>
-<span id="cb1-1274"><a href="#cb1-1274" aria-hidden="true" tabindex="-1"></a><span class="fu">generate_samples</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-1275"><a href="#cb1-1275" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of samples to generate at each interval</span></span>
-<span id="cb1-1276"><a href="#cb1-1276" aria-hidden="true" tabindex="-1"></a><span class="fu">num_generation_samples</span><span class="kw">:</span><span class="at"> int | None = 3</span></span>
-<span id="cb1-1277"><a href="#cb1-1277" aria-hidden="true" tabindex="-1"></a><span class="co"># Maximum new tokens to generate per sample</span></span>
-<span id="cb1-1278"><a href="#cb1-1278" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_max_new_tokens</span><span class="kw">:</span><span class="at"> int | None = 50</span></span>
-<span id="cb1-1279"><a href="#cb1-1279" aria-hidden="true" tabindex="-1"></a><span class="co"># Temperature for sample generation (0.0 = greedy)</span></span>
-<span id="cb1-1280"><a href="#cb1-1280" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_temperature</span><span class="kw">:</span><span class="at"> float | None = 0.7</span></span>
-<span id="cb1-1281"><a href="#cb1-1281" aria-hidden="true" tabindex="-1"></a><span class="co"># Nucleus sampling parameter for generation</span></span>
-<span id="cb1-1282"><a href="#cb1-1282" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_top_p</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1283"><a href="#cb1-1283" aria-hidden="true" tabindex="-1"></a><span class="co"># Top-k sampling parameter for generation</span></span>
-<span id="cb1-1284"><a href="#cb1-1284" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_top_k</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1285"><a href="#cb1-1285" aria-hidden="true" tabindex="-1"></a><span class="co"># Ratio of input to use as prompt (0.0-1.0)</span></span>
-<span id="cb1-1286"><a href="#cb1-1286" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_prompt_ratio</span><span class="kw">:</span><span class="at"> float | None = 0.5</span></span>
-<span id="cb1-1287"><a href="#cb1-1287" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use sampling (vs greedy decoding)</span></span>
-<span id="cb1-1288"><a href="#cb1-1288" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_do_sample</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
-<span id="cb1-1289"><a href="#cb1-1289" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1290"><a href="#cb1-1290" aria-hidden="true" tabindex="-1"></a><span class="co"># This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This</span></span>
-<span id="cb1-1291"><a href="#cb1-1291" aria-hidden="true" tabindex="-1"></a><span class="co"># can also be a relative path to a model on disk</span></span>
-<span id="cb1-1292"><a href="#cb1-1292" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model</span><span class="kw">:</span><span class="at"> str (required)</span></span>
-<span id="cb1-1293"><a href="#cb1-1293" aria-hidden="true" tabindex="-1"></a><span class="co"># If the base_model repo on hf hub doesn't include configuration .json files, You can</span></span>
-<span id="cb1-1294"><a href="#cb1-1294" aria-hidden="true" tabindex="-1"></a><span class="co"># set that here, or leave this empty to default to base_model</span></span>
-<span id="cb1-1295"><a href="#cb1-1295" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model_config</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1296"><a href="#cb1-1296" aria-hidden="true" tabindex="-1"></a><span class="co"># transformers config class (e.g., 'LlamaConfig', 'MistralConfig'). Defaults to</span></span>
-<span id="cb1-1297"><a href="#cb1-1297" aria-hidden="true" tabindex="-1"></a><span class="co"># AutoConfig.</span></span>
-<span id="cb1-1298"><a href="#cb1-1298" aria-hidden="true" tabindex="-1"></a><span class="fu">cls_model_config</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1299"><a href="#cb1-1299" aria-hidden="true" tabindex="-1"></a><span class="co"># Optional tokenizer configuration path in case you want to use a different tokenizer</span></span>
-<span id="cb1-1300"><a href="#cb1-1300" aria-hidden="true" tabindex="-1"></a><span class="co"># than the one defined in the base model</span></span>
-<span id="cb1-1301"><a href="#cb1-1301" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_config</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1302"><a href="#cb1-1302" aria-hidden="true" tabindex="-1"></a><span class="co"># use_fast option for tokenizer loading from_pretrained, default to True</span></span>
-<span id="cb1-1303"><a href="#cb1-1303" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_use_fast</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1304"><a href="#cb1-1304" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use the legacy tokenizer setting, defaults to True</span></span>
-<span id="cb1-1305"><a href="#cb1-1305" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_legacy</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1306"><a href="#cb1-1306" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use mistral-common tokenizer. If set to True, it will use the mistral-</span></span>
-<span id="cb1-1307"><a href="#cb1-1307" aria-hidden="true" tabindex="-1"></a><span class="co"># common tokenizer.</span></span>
-<span id="cb1-1308"><a href="#cb1-1308" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_use_mistral_common</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1309"><a href="#cb1-1309" aria-hidden="true" tabindex="-1"></a><span class="co"># Corresponding tokenizer for the model AutoTokenizer is a good choice</span></span>
-<span id="cb1-1310"><a href="#cb1-1310" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_type</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1311"><a href="#cb1-1311" aria-hidden="true" tabindex="-1"></a><span class="co"># transformers processor class</span></span>
-<span id="cb1-1312"><a href="#cb1-1312" aria-hidden="true" tabindex="-1"></a><span class="fu">processor_type</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1313"><a href="#cb1-1313" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to save jinja files for tokenizer, transformers default is True</span></span>
-<span id="cb1-1314"><a href="#cb1-1314" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_save_jinja_files</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
-<span id="cb1-1315"><a href="#cb1-1315" aria-hidden="true" tabindex="-1"></a><span class="co"># Trust remote code for untrusted source</span></span>
-<span id="cb1-1316"><a href="#cb1-1316" aria-hidden="true" tabindex="-1"></a><span class="fu">trust_remote_code</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1317"><a href="#cb1-1317" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1318"><a href="#cb1-1318" aria-hidden="true" tabindex="-1"></a><span class="co"># Don't move the model to the device before sharding. Set to `false` to revert to legacy</span></span>
-<span id="cb1-1319"><a href="#cb1-1319" aria-hidden="true" tabindex="-1"></a><span class="co"># behavior.</span></span>
-<span id="cb1-1320"><a href="#cb1-1320" aria-hidden="true" tabindex="-1"></a><span class="fu">experimental_skip_move_to_device</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
-<span id="cb1-1321"><a href="#cb1-1321" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1322"><a href="#cb1-1322" aria-hidden="true" tabindex="-1"></a><span class="co"># Use custom kernels, e.g. MegaBlocks.</span></span>
-<span id="cb1-1323"><a href="#cb1-1323" aria-hidden="true" tabindex="-1"></a><span class="fu">use_kernels</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1324"><a href="#cb1-1324" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1325"><a href="#cb1-1325" aria-hidden="true" tabindex="-1"></a><span class="co"># Model loading quantization config</span></span>
-<span id="cb1-1326"><a href="#cb1-1326" aria-hidden="true" tabindex="-1"></a><span class="fu">model_quantization_config</span><span class="kw">:</span><span class="at"> Literal['Mxfp4Config'] | None</span></span>
-<span id="cb1-1327"><a href="#cb1-1327" aria-hidden="true" tabindex="-1"></a><span class="co"># kwargs for model quantization config</span></span>
-<span id="cb1-1328"><a href="#cb1-1328" aria-hidden="true" tabindex="-1"></a><span class="fu">model_quantization_config_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
-<span id="cb1-1329"><a href="#cb1-1329" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1330"><a href="#cb1-1330" aria-hidden="true" tabindex="-1"></a><span class="co"># Where to save the full-finetuned model to</span></span>
-<span id="cb1-1331"><a href="#cb1-1331" aria-hidden="true" tabindex="-1"></a><span class="fu">output_dir</span><span class="kw">:</span><span class="at"> str = ./model-out</span></span>
-<span id="cb1-1332"><a href="#cb1-1332" aria-hidden="true" tabindex="-1"></a><span class="co"># push checkpoints to hub</span></span>
-<span id="cb1-1333"><a href="#cb1-1333" aria-hidden="true" tabindex="-1"></a><span class="fu">hub_model_id</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1334"><a href="#cb1-1334" aria-hidden="true" tabindex="-1"></a><span class="co"># how to push checkpoints to hub</span></span>
-<span id="cb1-1335"><a href="#cb1-1335" aria-hidden="true" tabindex="-1"></a><span class="fu">hub_strategy</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1336"><a href="#cb1-1336" aria-hidden="true" tabindex="-1"></a><span class="co"># branch/revision to push to on hub (default: main)</span></span>
-<span id="cb1-1337"><a href="#cb1-1337" aria-hidden="true" tabindex="-1"></a><span class="fu">hub_revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1338"><a href="#cb1-1338" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to save the model using safetensors format. Defaults to True.</span></span>
-<span id="cb1-1339"><a href="#cb1-1339" aria-hidden="true" tabindex="-1"></a><span class="fu">save_safetensors</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
-<span id="cb1-1340"><a href="#cb1-1340" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1341"><a href="#cb1-1341" aria-hidden="true" tabindex="-1"></a><span class="co"># This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer</span></span>
-<span id="cb1-1342"><a href="#cb1-1342" aria-hidden="true" tabindex="-1"></a><span class="fu">load_in_8bit</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-1343"><a href="#cb1-1343" aria-hidden="true" tabindex="-1"></a><span class="co"># Use bitsandbytes 4 bit</span></span>
-<span id="cb1-1344"><a href="#cb1-1344" aria-hidden="true" tabindex="-1"></a><span class="fu">load_in_4bit</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-1345"><a href="#cb1-1345" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1346"><a href="#cb1-1346" aria-hidden="true" tabindex="-1"></a><span class="co"># If you want to use 'lora', 'qlora', or 'llama-adapter', or leave blank to train all</span></span>
-<span id="cb1-1347"><a href="#cb1-1347" aria-hidden="true" tabindex="-1"></a><span class="co"># parameters in original model</span></span>
-<span id="cb1-1348"><a href="#cb1-1348" aria-hidden="true" tabindex="-1"></a><span class="fu">adapter</span><span class="kw">:</span><span class="at"> Literal['lora', 'qlora', 'llama-adapter'] | None</span></span>
-<span id="cb1-1349"><a href="#cb1-1349" aria-hidden="true" tabindex="-1"></a><span class="co"># If you already have a lora model trained that you want to load, put that here. This</span></span>
-<span id="cb1-1350"><a href="#cb1-1350" aria-hidden="true" tabindex="-1"></a><span class="co"># means after training, if you want to test the model, you should set this to the value</span></span>
-<span id="cb1-1351"><a href="#cb1-1351" aria-hidden="true" tabindex="-1"></a><span class="co"># of `output_dir`. Note that if you merge an adapter to the base model, a new</span></span>
-<span id="cb1-1352"><a href="#cb1-1352" aria-hidden="true" tabindex="-1"></a><span class="co"># subdirectory `merged` will be created under the `output_dir`.</span></span>
-<span id="cb1-1353"><a href="#cb1-1353" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_model_dir</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1354"><a href="#cb1-1354" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_r</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1355"><a href="#cb1-1355" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_alpha</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1356"><a href="#cb1-1356" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_fan_in_fan_out</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1357"><a href="#cb1-1357" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_modules</span><span class="kw">:</span><span class="at"> str | list[str] | None</span></span>
-<span id="cb1-1358"><a href="#cb1-1358" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_parameters</span><span class="kw">:</span><span class="at"> str | list[str] | None</span></span>
-<span id="cb1-1359"><a href="#cb1-1359" aria-hidden="true" tabindex="-1"></a><span class="co"># If true, will target all linear modules</span></span>
-<span id="cb1-1360"><a href="#cb1-1360" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_linear</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1361"><a href="#cb1-1361" aria-hidden="true" tabindex="-1"></a><span class="co"># If you added new tokens to the tokenizer, you may need to save some LoRA modules</span></span>
-<span id="cb1-1362"><a href="#cb1-1362" aria-hidden="true" tabindex="-1"></a><span class="co"># because they need to know the new tokens. For LLaMA and Mistral, you need to save</span></span>
-<span id="cb1-1363"><a href="#cb1-1363" aria-hidden="true" tabindex="-1"></a><span class="co"># `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts</span></span>
-<span id="cb1-1364"><a href="#cb1-1364" aria-hidden="true" tabindex="-1"></a><span class="co"># tokens to embeddings, and `lm_head` converts embeddings to token probabilities.</span></span>
-<span id="cb1-1365"><a href="#cb1-1365" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_modules_to_save</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-1366"><a href="#cb1-1366" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_dropout</span><span class="kw">:</span><span class="at"> float | None = 0.0</span></span>
-<span id="cb1-1367"><a href="#cb1-1367" aria-hidden="true" tabindex="-1"></a><span class="co"># The layer indices to transform, otherwise, apply to all layers</span></span>
-<span id="cb1-1368"><a href="#cb1-1368" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_layers_to_transform</span><span class="kw">:</span><span class="at"> list[int] | None</span></span>
-<span id="cb1-1369"><a href="#cb1-1369" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_layers_pattern</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-1370"><a href="#cb1-1370" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1371"><a href="#cb1-1371" aria-hidden="true" tabindex="-1"></a><span class="fu">peft</span><span class="kw">:</span><span class="at"> PeftConfig | None</span></span>
-<span id="cb1-1372"><a href="#cb1-1372" aria-hidden="true" tabindex="-1"></a><span class="co">  # For PeftConfig:</span></span>
-<span id="cb1-1373"><a href="#cb1-1373" aria-hidden="true" tabindex="-1"></a><span class="co">  # Configuration options for loftq initialization for LoRA</span></span>
-<span id="cb1-1374"><a href="#cb1-1374" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">loftq_config</span><span class="kw">:</span><span class="at"> LoftQConfig | None</span></span>
-<span id="cb1-1375"><a href="#cb1-1375" aria-hidden="true" tabindex="-1"></a><span class="co">    # For LoftQConfig:</span></span>
-<span id="cb1-1376"><a href="#cb1-1376" aria-hidden="true" tabindex="-1"></a><span class="co">    # typically 4 bits</span></span>
-<span id="cb1-1377"><a href="#cb1-1377" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">loftq_bits</span><span class="kw">:</span><span class="at"> int = 4</span></span>
-<span id="cb1-1378"><a href="#cb1-1378" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1379"><a href="#cb1-1379" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use DoRA.</span></span>
-<span id="cb1-1380"><a href="#cb1-1380" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_use_dora</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1381"><a href="#cb1-1381" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use RSLoRA.</span></span>
-<span id="cb1-1382"><a href="#cb1-1382" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_use_rslora</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1383"><a href="#cb1-1383" aria-hidden="true" tabindex="-1"></a><span class="co"># List of layer indices to replicate.</span></span>
-<span id="cb1-1384"><a href="#cb1-1384" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_layer_replication</span><span class="kw">:</span><span class="at"> list[tuple[int, int]] | None</span></span>
-<span id="cb1-1385"><a href="#cb1-1385" aria-hidden="true" tabindex="-1"></a><span class="co"># How to initialize LoRA weights. Default to True which is MS original implementation.</span></span>
-<span id="cb1-1386"><a href="#cb1-1386" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_init_lora_weights</span><span class="kw">:</span><span class="at"> bool | str | None</span></span>
-<span id="cb1-1387"><a href="#cb1-1387" aria-hidden="true" tabindex="-1"></a><span class="co"># A list of token indices to fine-tune on the `embed_tokens` layer. Otherwise, a dict</span></span>
-<span id="cb1-1388"><a href="#cb1-1388" aria-hidden="true" tabindex="-1"></a><span class="co"># mapping an embedding layer name to its trainable token indices. See</span></span>
-<span id="cb1-1389"><a href="#cb1-1389" aria-hidden="true" tabindex="-1"></a><span class="co"># https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-</span></span>
-<span id="cb1-1390"><a href="#cb1-1390" aria-hidden="true" tabindex="-1"></a><span class="co"># tokens-alongside-lora</span></span>
-<span id="cb1-1391"><a href="#cb1-1391" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_trainable_token_indices</span><span class="kw">:</span><span class="at"> list[int] | dict[str, list[int]] | None</span></span>
-<span id="cb1-1392"><a href="#cb1-1392" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to tie adapter weights for tied model weights. See</span></span>
-<span id="cb1-1393"><a href="#cb1-1393" aria-hidden="true" tabindex="-1"></a><span class="co"># https://github.com/huggingface/peft/issues/2864</span></span>
-<span id="cb1-1394"><a href="#cb1-1394" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_ensure_weight_tying</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1395"><a href="#cb1-1395" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to upcast the LoRA adapter to fp32. This is enabled by default in PEFT.</span></span>
-<span id="cb1-1396"><a href="#cb1-1396" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_autocast_adapter_dtype</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1397"><a href="#cb1-1397" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1398"><a href="#cb1-1398" aria-hidden="true" tabindex="-1"></a><span class="co"># load qlora model in sharded format for FSDP using answer.ai technique.</span></span>
-<span id="cb1-1399"><a href="#cb1-1399" aria-hidden="true" tabindex="-1"></a><span class="fu">qlora_sharded_model_loading</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-1400"><a href="#cb1-1400" aria-hidden="true" tabindex="-1"></a><span class="co"># Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it</span></span>
-<span id="cb1-1401"><a href="#cb1-1401" aria-hidden="true" tabindex="-1"></a><span class="co"># takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge</span></span>
-<span id="cb1-1402"><a href="#cb1-1402" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_on_cpu</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1403"><a href="#cb1-1403" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether you are training a 4-bit GPTQ quantized model</span></span>
-<span id="cb1-1404"><a href="#cb1-1404" aria-hidden="true" tabindex="-1"></a><span class="fu">gptq</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1405"><a href="#cb1-1405" aria-hidden="true" tabindex="-1"></a><span class="co"># optional overrides to the bnb 4bit quantization configuration</span></span>
-<span id="cb1-1406"><a href="#cb1-1406" aria-hidden="true" tabindex="-1"></a><span class="fu">bnb_config_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
-<span id="cb1-1407"><a href="#cb1-1407" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1408"><a href="#cb1-1408" aria-hidden="true" tabindex="-1"></a><span class="co"># loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.</span></span>
-<span id="cb1-1409"><a href="#cb1-1409" aria-hidden="true" tabindex="-1"></a><span class="fu">loraplus_lr_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1410"><a href="#cb1-1410" aria-hidden="true" tabindex="-1"></a><span class="co"># loraplus learning rate for lora embedding layers. Default value is 1e-6.</span></span>
-<span id="cb1-1411"><a href="#cb1-1411" aria-hidden="true" tabindex="-1"></a><span class="fu">loraplus_lr_embedding</span><span class="kw">:</span><span class="at"> float | None = 1e-06</span></span>
-<span id="cb1-1412"><a href="#cb1-1412" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1413"><a href="#cb1-1413" aria-hidden="true" tabindex="-1"></a><span class="fu">merge_lora</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1414"><a href="#cb1-1414" aria-hidden="true" tabindex="-1"></a><span class="co"># Method to use for LoRA merging. 'memory_efficient' (default) processes shards</span></span>
-<span id="cb1-1415"><a href="#cb1-1415" aria-hidden="true" tabindex="-1"></a><span class="co"># individually to reduce memory usage, 'legacy' loads the full model into memory.</span></span>
-<span id="cb1-1416"><a href="#cb1-1416" aria-hidden="true" tabindex="-1"></a><span class="fu">merge_method</span><span class="kw">:</span><span class="at"> Literal['legacy', 'memory_efficient'] | None = memory_efficient</span></span>
-<span id="cb1-1417"><a href="#cb1-1417" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1418"><a href="#cb1-1418" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use ReLoRA. Use with jagged_restart_*steps options.</span></span>
-<span id="cb1-1419"><a href="#cb1-1419" aria-hidden="true" tabindex="-1"></a><span class="fu">relora</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1420"><a href="#cb1-1420" aria-hidden="true" tabindex="-1"></a><span class="co"># threshold for optimizer magnitude when pruning</span></span>
-<span id="cb1-1421"><a href="#cb1-1421" aria-hidden="true" tabindex="-1"></a><span class="fu">relora_prune_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1422"><a href="#cb1-1422" aria-hidden="true" tabindex="-1"></a><span class="co"># True to perform lora weight merges on cpu during restarts, for modest gpu memory</span></span>
-<span id="cb1-1423"><a href="#cb1-1423" aria-hidden="true" tabindex="-1"></a><span class="co"># savings</span></span>
-<span id="cb1-1424"><a href="#cb1-1424" aria-hidden="true" tabindex="-1"></a><span class="fu">relora_cpu_offload</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1425"><a href="#cb1-1425" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1426"><a href="#cb1-1426" aria-hidden="true" tabindex="-1"></a><span class="co"># how often to reset for jagged restarts</span></span>
-<span id="cb1-1427"><a href="#cb1-1427" aria-hidden="true" tabindex="-1"></a><span class="fu">jagged_restart_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1428"><a href="#cb1-1428" aria-hidden="true" tabindex="-1"></a><span class="co"># how many warmup steps to take after reset for jagged restarts</span></span>
-<span id="cb1-1429"><a href="#cb1-1429" aria-hidden="true" tabindex="-1"></a><span class="fu">jagged_restart_warmup_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1430"><a href="#cb1-1430" aria-hidden="true" tabindex="-1"></a><span class="co"># how many anneal steps to take before reset for jagged restarts</span></span>
-<span id="cb1-1431"><a href="#cb1-1431" aria-hidden="true" tabindex="-1"></a><span class="fu">jagged_restart_anneal_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1432"><a href="#cb1-1432" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1433"><a href="#cb1-1433" aria-hidden="true" tabindex="-1"></a><span class="co"># If greater than 1, backpropagation will be skipped and the gradients will be</span></span>
-<span id="cb1-1434"><a href="#cb1-1434" aria-hidden="true" tabindex="-1"></a><span class="co"># accumulated for the given number of steps.</span></span>
-<span id="cb1-1435"><a href="#cb1-1435" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_accumulation_steps</span><span class="kw">:</span><span class="at"> int | None = 1</span></span>
-<span id="cb1-1436"><a href="#cb1-1436" aria-hidden="true" tabindex="-1"></a><span class="co"># The number of samples to include in each batch. This is the number of samples sent to</span></span>
-<span id="cb1-1437"><a href="#cb1-1437" aria-hidden="true" tabindex="-1"></a><span class="co"># each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps</span></span>
-<span id="cb1-1438"><a href="#cb1-1438" aria-hidden="true" tabindex="-1"></a><span class="fu">micro_batch_size</span><span class="kw">:</span><span class="at"> int | None = 1</span></span>
-<span id="cb1-1439"><a href="#cb1-1439" aria-hidden="true" tabindex="-1"></a><span class="co"># Total batch size, we do not recommended setting this manually</span></span>
-<span id="cb1-1440"><a href="#cb1-1440" aria-hidden="true" tabindex="-1"></a><span class="fu">batch_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1441"><a href="#cb1-1441" aria-hidden="true" tabindex="-1"></a><span class="co"># per gpu micro batch size for evals, defaults to value of micro_batch_size</span></span>
-<span id="cb1-1442"><a href="#cb1-1442" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_batch_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1443"><a href="#cb1-1443" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1444"><a href="#cb1-1444" aria-hidden="true" tabindex="-1"></a><span class="co"># whether to find batch size that fits in memory. Passed to underlying transformers</span></span>
-<span id="cb1-1445"><a href="#cb1-1445" aria-hidden="true" tabindex="-1"></a><span class="co"># Trainer</span></span>
-<span id="cb1-1446"><a href="#cb1-1446" aria-hidden="true" tabindex="-1"></a><span class="fu">auto_find_batch_size</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1447"><a href="#cb1-1447" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1448"><a href="#cb1-1448" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to mask out or include the human's prompt from the training labels</span></span>
-<span id="cb1-1449"><a href="#cb1-1449" aria-hidden="true" tabindex="-1"></a><span class="fu">train_on_inputs</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-1450"><a href="#cb1-1450" aria-hidden="true" tabindex="-1"></a><span class="co"># Group similarly sized data to minimize padding. May be slower to start, as it must</span></span>
-<span id="cb1-1451"><a href="#cb1-1451" aria-hidden="true" tabindex="-1"></a><span class="co"># download and sort the entire dataset. Note that training loss may have an oscillating</span></span>
-<span id="cb1-1452"><a href="#cb1-1452" aria-hidden="true" tabindex="-1"></a><span class="co"># pattern with this enabled.</span></span>
-<span id="cb1-1453"><a href="#cb1-1453" aria-hidden="true" tabindex="-1"></a><span class="fu">group_by_length</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1454"><a href="#cb1-1454" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1455"><a href="#cb1-1455" aria-hidden="true" tabindex="-1"></a><span class="fu">learning_rate</span><span class="kw">:</span><span class="at"> str | float (required)</span></span>
-<span id="cb1-1456"><a href="#cb1-1456" aria-hidden="true" tabindex="-1"></a><span class="fu">embedding_lr</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1457"><a href="#cb1-1457" aria-hidden="true" tabindex="-1"></a><span class="fu">embedding_lr_scale</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1458"><a href="#cb1-1458" aria-hidden="true" tabindex="-1"></a><span class="co"># Specify weight decay</span></span>
-<span id="cb1-1459"><a href="#cb1-1459" aria-hidden="true" tabindex="-1"></a><span class="fu">weight_decay</span><span class="kw">:</span><span class="at"> float | None = 0.0</span></span>
-<span id="cb1-1460"><a href="#cb1-1460" aria-hidden="true" tabindex="-1"></a><span class="co"># Specify optimizer</span></span>
-<span id="cb1-1461"><a href="#cb1-1461" aria-hidden="true" tabindex="-1"></a><span class="fu">optimizer</span><span class="kw">:</span><span class="at"> OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED</span></span>
-<span id="cb1-1462"><a href="#cb1-1462" aria-hidden="true" tabindex="-1"></a><span class="co"># Dictionary of arguments to pass to the optimizer</span></span>
-<span id="cb1-1463"><a href="#cb1-1463" aria-hidden="true" tabindex="-1"></a><span class="fu">optim_args</span><span class="kw">:</span><span class="at"> str | dict[str, Any] | None</span></span>
-<span id="cb1-1464"><a href="#cb1-1464" aria-hidden="true" tabindex="-1"></a><span class="co"># The target modules to optimize, i.e. the module names that you would like to train,</span></span>
-<span id="cb1-1465"><a href="#cb1-1465" aria-hidden="true" tabindex="-1"></a><span class="co"># right now this is used only for GaLore algorithm</span></span>
-<span id="cb1-1466"><a href="#cb1-1466" aria-hidden="true" tabindex="-1"></a><span class="fu">optim_target_modules</span><span class="kw">:</span><span class="at"> list[str] | Literal['all_linear'] | None</span></span>
-<span id="cb1-1467"><a href="#cb1-1467" aria-hidden="true" tabindex="-1"></a><span class="co"># Path to torch distx for optim 'adamw_anyprecision'</span></span>
-<span id="cb1-1468"><a href="#cb1-1468" aria-hidden="true" tabindex="-1"></a><span class="fu">torchdistx_path</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1469"><a href="#cb1-1469" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_scheduler</span><span class="kw">:</span><span class="at"> SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE</span></span>
-<span id="cb1-1470"><a href="#cb1-1470" aria-hidden="true" tabindex="-1"></a><span class="co"># Specify a scheduler and kwargs to use with the optimizer</span></span>
-<span id="cb1-1471"><a href="#cb1-1471" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_scheduler_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
-<span id="cb1-1472"><a href="#cb1-1472" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_quadratic_warmup</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1473"><a href="#cb1-1473" aria-hidden="true" tabindex="-1"></a><span class="co"># decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of</span></span>
-<span id="cb1-1474"><a href="#cb1-1474" aria-hidden="true" tabindex="-1"></a><span class="co"># peak lr</span></span>
-<span id="cb1-1475"><a href="#cb1-1475" aria-hidden="true" tabindex="-1"></a><span class="fu">cosine_min_lr_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1476"><a href="#cb1-1476" aria-hidden="true" tabindex="-1"></a><span class="co"># freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means</span></span>
-<span id="cb1-1477"><a href="#cb1-1477" aria-hidden="true" tabindex="-1"></a><span class="co"># start cosine_min_lr at 80% of training step</span></span>
-<span id="cb1-1478"><a href="#cb1-1478" aria-hidden="true" tabindex="-1"></a><span class="fu">cosine_constant_lr_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1479"><a href="#cb1-1479" aria-hidden="true" tabindex="-1"></a><span class="co"># Learning rate div factor</span></span>
-<span id="cb1-1480"><a href="#cb1-1480" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_div_factor</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1481"><a href="#cb1-1481" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1482"><a href="#cb1-1482" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_groups</span><span class="kw">:</span><span class="at"> list[LrGroup] | None</span></span>
-<span id="cb1-1483"><a href="#cb1-1483" aria-hidden="true" tabindex="-1"></a><span class="co">  # For LrGroup:</span></span>
-<span id="cb1-1484"><a href="#cb1-1484" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">name</span><span class="kw">:</span><span class="at"> str (required)</span></span>
-<span id="cb1-1485"><a href="#cb1-1485" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">modules</span><span class="kw">:</span><span class="at"> list[str] (required)</span></span>
-<span id="cb1-1486"><a href="#cb1-1486" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">lr</span><span class="kw">:</span><span class="at"> float (required)</span></span>
-<span id="cb1-1487"><a href="#cb1-1487" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1488"><a href="#cb1-1488" aria-hidden="true" tabindex="-1"></a><span class="co"># adamw hyperparams</span></span>
-<span id="cb1-1489"><a href="#cb1-1489" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_epsilon</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1490"><a href="#cb1-1490" aria-hidden="true" tabindex="-1"></a><span class="co"># only used for CAME Optimizer</span></span>
-<span id="cb1-1491"><a href="#cb1-1491" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_epsilon2</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1492"><a href="#cb1-1492" aria-hidden="true" tabindex="-1"></a><span class="co"># adamw hyperparams</span></span>
-<span id="cb1-1493"><a href="#cb1-1493" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_beta1</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1262"><a href="#cb1-1262" aria-hidden="true" tabindex="-1"></a><span class="fu">is_falcon_derived_model</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1263"><a href="#cb1-1263" aria-hidden="true" tabindex="-1"></a><span class="co"># Internal use only - Used to identify which the model is based on</span></span>
+<span id="cb1-1264"><a href="#cb1-1264" aria-hidden="true" tabindex="-1"></a><span class="fu">is_llama_derived_model</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1265"><a href="#cb1-1265" aria-hidden="true" tabindex="-1"></a><span class="co"># Internal use only - Used to identify which the model is based on. Please note that if</span></span>
+<span id="cb1-1266"><a href="#cb1-1266" aria-hidden="true" tabindex="-1"></a><span class="co"># you set this to true, `padding_side` will be set to 'left' by default</span></span>
+<span id="cb1-1267"><a href="#cb1-1267" aria-hidden="true" tabindex="-1"></a><span class="fu">is_mistral_derived_model</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1268"><a href="#cb1-1268" aria-hidden="true" tabindex="-1"></a><span class="co"># Internal use only - Used to identify which the model is based on</span></span>
+<span id="cb1-1269"><a href="#cb1-1269" aria-hidden="true" tabindex="-1"></a><span class="fu">is_qwen_derived_model</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1270"><a href="#cb1-1270" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1271"><a href="#cb1-1271" aria-hidden="true" tabindex="-1"></a><span class="co"># Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available</span></span>
+<span id="cb1-1272"><a href="#cb1-1272" aria-hidden="true" tabindex="-1"></a><span class="co"># plugins or doc below for more details.</span></span>
+<span id="cb1-1273"><a href="#cb1-1273" aria-hidden="true" tabindex="-1"></a><span class="co"># https://docs.axolotl.ai/docs/custom_integrations.html</span></span>
+<span id="cb1-1274"><a href="#cb1-1274" aria-hidden="true" tabindex="-1"></a><span class="fu">plugins</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-1275"><a href="#cb1-1275" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable sample generation during training for monitoring</span></span>
+<span id="cb1-1276"><a href="#cb1-1276" aria-hidden="true" tabindex="-1"></a><span class="fu">generate_samples</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-1277"><a href="#cb1-1277" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of samples to generate at each interval</span></span>
+<span id="cb1-1278"><a href="#cb1-1278" aria-hidden="true" tabindex="-1"></a><span class="fu">num_generation_samples</span><span class="kw">:</span><span class="at"> int | None = 3</span></span>
+<span id="cb1-1279"><a href="#cb1-1279" aria-hidden="true" tabindex="-1"></a><span class="co"># Maximum new tokens to generate per sample</span></span>
+<span id="cb1-1280"><a href="#cb1-1280" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_max_new_tokens</span><span class="kw">:</span><span class="at"> int | None = 50</span></span>
+<span id="cb1-1281"><a href="#cb1-1281" aria-hidden="true" tabindex="-1"></a><span class="co"># Temperature for sample generation (0.0 = greedy)</span></span>
+<span id="cb1-1282"><a href="#cb1-1282" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_temperature</span><span class="kw">:</span><span class="at"> float | None = 0.7</span></span>
+<span id="cb1-1283"><a href="#cb1-1283" aria-hidden="true" tabindex="-1"></a><span class="co"># Nucleus sampling parameter for generation</span></span>
+<span id="cb1-1284"><a href="#cb1-1284" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_top_p</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1285"><a href="#cb1-1285" aria-hidden="true" tabindex="-1"></a><span class="co"># Top-k sampling parameter for generation</span></span>
+<span id="cb1-1286"><a href="#cb1-1286" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_top_k</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1287"><a href="#cb1-1287" aria-hidden="true" tabindex="-1"></a><span class="co"># Ratio of input to use as prompt (0.0-1.0)</span></span>
+<span id="cb1-1288"><a href="#cb1-1288" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_prompt_ratio</span><span class="kw">:</span><span class="at"> float | None = 0.5</span></span>
+<span id="cb1-1289"><a href="#cb1-1289" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use sampling (vs greedy decoding)</span></span>
+<span id="cb1-1290"><a href="#cb1-1290" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_do_sample</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
+<span id="cb1-1291"><a href="#cb1-1291" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1292"><a href="#cb1-1292" aria-hidden="true" tabindex="-1"></a><span class="co"># This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This</span></span>
+<span id="cb1-1293"><a href="#cb1-1293" aria-hidden="true" tabindex="-1"></a><span class="co"># can also be a relative path to a model on disk</span></span>
+<span id="cb1-1294"><a href="#cb1-1294" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model</span><span class="kw">:</span><span class="at"> str (required)</span></span>
+<span id="cb1-1295"><a href="#cb1-1295" aria-hidden="true" tabindex="-1"></a><span class="co"># If the base_model repo on hf hub doesn't include configuration .json files, You can</span></span>
+<span id="cb1-1296"><a href="#cb1-1296" aria-hidden="true" tabindex="-1"></a><span class="co"># set that here, or leave this empty to default to base_model</span></span>
+<span id="cb1-1297"><a href="#cb1-1297" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model_config</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1298"><a href="#cb1-1298" aria-hidden="true" tabindex="-1"></a><span class="co"># transformers config class (e.g., 'LlamaConfig', 'MistralConfig'). Defaults to</span></span>
+<span id="cb1-1299"><a href="#cb1-1299" aria-hidden="true" tabindex="-1"></a><span class="co"># AutoConfig.</span></span>
+<span id="cb1-1300"><a href="#cb1-1300" aria-hidden="true" tabindex="-1"></a><span class="fu">cls_model_config</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1301"><a href="#cb1-1301" aria-hidden="true" tabindex="-1"></a><span class="co"># Optional tokenizer configuration path in case you want to use a different tokenizer</span></span>
+<span id="cb1-1302"><a href="#cb1-1302" aria-hidden="true" tabindex="-1"></a><span class="co"># than the one defined in the base model</span></span>
+<span id="cb1-1303"><a href="#cb1-1303" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_config</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1304"><a href="#cb1-1304" aria-hidden="true" tabindex="-1"></a><span class="co"># use_fast option for tokenizer loading from_pretrained, default to True</span></span>
+<span id="cb1-1305"><a href="#cb1-1305" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_use_fast</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1306"><a href="#cb1-1306" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use the legacy tokenizer setting, defaults to True</span></span>
+<span id="cb1-1307"><a href="#cb1-1307" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_legacy</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1308"><a href="#cb1-1308" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use mistral-common tokenizer. If set to True, it will use the mistral-</span></span>
+<span id="cb1-1309"><a href="#cb1-1309" aria-hidden="true" tabindex="-1"></a><span class="co"># common tokenizer.</span></span>
+<span id="cb1-1310"><a href="#cb1-1310" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_use_mistral_common</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1311"><a href="#cb1-1311" aria-hidden="true" tabindex="-1"></a><span class="co"># Corresponding tokenizer for the model AutoTokenizer is a good choice</span></span>
+<span id="cb1-1312"><a href="#cb1-1312" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_type</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1313"><a href="#cb1-1313" aria-hidden="true" tabindex="-1"></a><span class="co"># transformers processor class</span></span>
+<span id="cb1-1314"><a href="#cb1-1314" aria-hidden="true" tabindex="-1"></a><span class="fu">processor_type</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1315"><a href="#cb1-1315" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to save jinja files for tokenizer, transformers default is True</span></span>
+<span id="cb1-1316"><a href="#cb1-1316" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_save_jinja_files</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
+<span id="cb1-1317"><a href="#cb1-1317" aria-hidden="true" tabindex="-1"></a><span class="co"># Trust remote code for untrusted source</span></span>
+<span id="cb1-1318"><a href="#cb1-1318" aria-hidden="true" tabindex="-1"></a><span class="fu">trust_remote_code</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1319"><a href="#cb1-1319" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1320"><a href="#cb1-1320" aria-hidden="true" tabindex="-1"></a><span class="co"># Don't move the model to the device before sharding. Set to `false` to revert to legacy</span></span>
+<span id="cb1-1321"><a href="#cb1-1321" aria-hidden="true" tabindex="-1"></a><span class="co"># behavior.</span></span>
+<span id="cb1-1322"><a href="#cb1-1322" aria-hidden="true" tabindex="-1"></a><span class="fu">experimental_skip_move_to_device</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
+<span id="cb1-1323"><a href="#cb1-1323" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1324"><a href="#cb1-1324" aria-hidden="true" tabindex="-1"></a><span class="co"># Use custom kernels, e.g. MegaBlocks.</span></span>
+<span id="cb1-1325"><a href="#cb1-1325" aria-hidden="true" tabindex="-1"></a><span class="fu">use_kernels</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1326"><a href="#cb1-1326" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1327"><a href="#cb1-1327" aria-hidden="true" tabindex="-1"></a><span class="co"># Model loading quantization config</span></span>
+<span id="cb1-1328"><a href="#cb1-1328" aria-hidden="true" tabindex="-1"></a><span class="fu">model_quantization_config</span><span class="kw">:</span><span class="at"> Literal['Mxfp4Config'] | None</span></span>
+<span id="cb1-1329"><a href="#cb1-1329" aria-hidden="true" tabindex="-1"></a><span class="co"># kwargs for model quantization config</span></span>
+<span id="cb1-1330"><a href="#cb1-1330" aria-hidden="true" tabindex="-1"></a><span class="fu">model_quantization_config_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
+<span id="cb1-1331"><a href="#cb1-1331" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1332"><a href="#cb1-1332" aria-hidden="true" tabindex="-1"></a><span class="co"># Where to save the full-finetuned model to</span></span>
+<span id="cb1-1333"><a href="#cb1-1333" aria-hidden="true" tabindex="-1"></a><span class="fu">output_dir</span><span class="kw">:</span><span class="at"> str = ./model-out</span></span>
+<span id="cb1-1334"><a href="#cb1-1334" aria-hidden="true" tabindex="-1"></a><span class="co"># push checkpoints to hub</span></span>
+<span id="cb1-1335"><a href="#cb1-1335" aria-hidden="true" tabindex="-1"></a><span class="fu">hub_model_id</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1336"><a href="#cb1-1336" aria-hidden="true" tabindex="-1"></a><span class="co"># how to push checkpoints to hub</span></span>
+<span id="cb1-1337"><a href="#cb1-1337" aria-hidden="true" tabindex="-1"></a><span class="fu">hub_strategy</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1338"><a href="#cb1-1338" aria-hidden="true" tabindex="-1"></a><span class="co"># branch/revision to push to on hub (default: main)</span></span>
+<span id="cb1-1339"><a href="#cb1-1339" aria-hidden="true" tabindex="-1"></a><span class="fu">hub_revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1340"><a href="#cb1-1340" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to save the model using safetensors format. Defaults to True.</span></span>
+<span id="cb1-1341"><a href="#cb1-1341" aria-hidden="true" tabindex="-1"></a><span class="fu">save_safetensors</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
+<span id="cb1-1342"><a href="#cb1-1342" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1343"><a href="#cb1-1343" aria-hidden="true" tabindex="-1"></a><span class="co"># This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer</span></span>
+<span id="cb1-1344"><a href="#cb1-1344" aria-hidden="true" tabindex="-1"></a><span class="fu">load_in_8bit</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-1345"><a href="#cb1-1345" aria-hidden="true" tabindex="-1"></a><span class="co"># Use bitsandbytes 4 bit</span></span>
+<span id="cb1-1346"><a href="#cb1-1346" aria-hidden="true" tabindex="-1"></a><span class="fu">load_in_4bit</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-1347"><a href="#cb1-1347" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1348"><a href="#cb1-1348" aria-hidden="true" tabindex="-1"></a><span class="co"># If you want to use 'lora', 'qlora', or 'llama-adapter', or leave blank to train all</span></span>
+<span id="cb1-1349"><a href="#cb1-1349" aria-hidden="true" tabindex="-1"></a><span class="co"># parameters in original model</span></span>
+<span id="cb1-1350"><a href="#cb1-1350" aria-hidden="true" tabindex="-1"></a><span class="fu">adapter</span><span class="kw">:</span><span class="at"> Literal['lora', 'qlora', 'llama-adapter'] | None</span></span>
+<span id="cb1-1351"><a href="#cb1-1351" aria-hidden="true" tabindex="-1"></a><span class="co"># If you already have a lora model trained that you want to load, put that here. This</span></span>
+<span id="cb1-1352"><a href="#cb1-1352" aria-hidden="true" tabindex="-1"></a><span class="co"># means after training, if you want to test the model, you should set this to the value</span></span>
+<span id="cb1-1353"><a href="#cb1-1353" aria-hidden="true" tabindex="-1"></a><span class="co"># of `output_dir`. Note that if you merge an adapter to the base model, a new</span></span>
+<span id="cb1-1354"><a href="#cb1-1354" aria-hidden="true" tabindex="-1"></a><span class="co"># subdirectory `merged` will be created under the `output_dir`.</span></span>
+<span id="cb1-1355"><a href="#cb1-1355" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_model_dir</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1356"><a href="#cb1-1356" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_r</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1357"><a href="#cb1-1357" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_alpha</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1358"><a href="#cb1-1358" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_fan_in_fan_out</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1359"><a href="#cb1-1359" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_modules</span><span class="kw">:</span><span class="at"> str | list[str] | None</span></span>
+<span id="cb1-1360"><a href="#cb1-1360" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_parameters</span><span class="kw">:</span><span class="at"> str | list[str] | None</span></span>
+<span id="cb1-1361"><a href="#cb1-1361" aria-hidden="true" tabindex="-1"></a><span class="co"># If true, will target all linear modules</span></span>
+<span id="cb1-1362"><a href="#cb1-1362" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_linear</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1363"><a href="#cb1-1363" aria-hidden="true" tabindex="-1"></a><span class="co"># If you added new tokens to the tokenizer, you may need to save some LoRA modules</span></span>
+<span id="cb1-1364"><a href="#cb1-1364" aria-hidden="true" tabindex="-1"></a><span class="co"># because they need to know the new tokens. For LLaMA and Mistral, you need to save</span></span>
+<span id="cb1-1365"><a href="#cb1-1365" aria-hidden="true" tabindex="-1"></a><span class="co"># `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts</span></span>
+<span id="cb1-1366"><a href="#cb1-1366" aria-hidden="true" tabindex="-1"></a><span class="co"># tokens to embeddings, and `lm_head` converts embeddings to token probabilities.</span></span>
+<span id="cb1-1367"><a href="#cb1-1367" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_modules_to_save</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-1368"><a href="#cb1-1368" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_dropout</span><span class="kw">:</span><span class="at"> float | None = 0.0</span></span>
+<span id="cb1-1369"><a href="#cb1-1369" aria-hidden="true" tabindex="-1"></a><span class="co"># The layer indices to transform, otherwise, apply to all layers</span></span>
+<span id="cb1-1370"><a href="#cb1-1370" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_layers_to_transform</span><span class="kw">:</span><span class="at"> list[int] | None</span></span>
+<span id="cb1-1371"><a href="#cb1-1371" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_layers_pattern</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-1372"><a href="#cb1-1372" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1373"><a href="#cb1-1373" aria-hidden="true" tabindex="-1"></a><span class="fu">peft</span><span class="kw">:</span><span class="at"> PeftConfig | None</span></span>
+<span id="cb1-1374"><a href="#cb1-1374" aria-hidden="true" tabindex="-1"></a><span class="co">  # For PeftConfig:</span></span>
+<span id="cb1-1375"><a href="#cb1-1375" aria-hidden="true" tabindex="-1"></a><span class="co">  # Configuration options for loftq initialization for LoRA</span></span>
+<span id="cb1-1376"><a href="#cb1-1376" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">loftq_config</span><span class="kw">:</span><span class="at"> LoftQConfig | None</span></span>
+<span id="cb1-1377"><a href="#cb1-1377" aria-hidden="true" tabindex="-1"></a><span class="co">    # For LoftQConfig:</span></span>
+<span id="cb1-1378"><a href="#cb1-1378" aria-hidden="true" tabindex="-1"></a><span class="co">    # typically 4 bits</span></span>
+<span id="cb1-1379"><a href="#cb1-1379" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">loftq_bits</span><span class="kw">:</span><span class="at"> int = 4</span></span>
+<span id="cb1-1380"><a href="#cb1-1380" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1381"><a href="#cb1-1381" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use DoRA.</span></span>
+<span id="cb1-1382"><a href="#cb1-1382" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_use_dora</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1383"><a href="#cb1-1383" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use RSLoRA.</span></span>
+<span id="cb1-1384"><a href="#cb1-1384" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_use_rslora</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1385"><a href="#cb1-1385" aria-hidden="true" tabindex="-1"></a><span class="co"># List of layer indices to replicate.</span></span>
+<span id="cb1-1386"><a href="#cb1-1386" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_layer_replication</span><span class="kw">:</span><span class="at"> list[tuple[int, int]] | None</span></span>
+<span id="cb1-1387"><a href="#cb1-1387" aria-hidden="true" tabindex="-1"></a><span class="co"># How to initialize LoRA weights. Default to True which is MS original implementation.</span></span>
+<span id="cb1-1388"><a href="#cb1-1388" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_init_lora_weights</span><span class="kw">:</span><span class="at"> bool | str | None</span></span>
+<span id="cb1-1389"><a href="#cb1-1389" aria-hidden="true" tabindex="-1"></a><span class="co"># A list of token indices to fine-tune on the `embed_tokens` layer. Otherwise, a dict</span></span>
+<span id="cb1-1390"><a href="#cb1-1390" aria-hidden="true" tabindex="-1"></a><span class="co"># mapping an embedding layer name to its trainable token indices. See</span></span>
+<span id="cb1-1391"><a href="#cb1-1391" aria-hidden="true" tabindex="-1"></a><span class="co"># https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-</span></span>
+<span id="cb1-1392"><a href="#cb1-1392" aria-hidden="true" tabindex="-1"></a><span class="co"># tokens-alongside-lora</span></span>
+<span id="cb1-1393"><a href="#cb1-1393" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_trainable_token_indices</span><span class="kw">:</span><span class="at"> list[int] | dict[str, list[int]] | None</span></span>
+<span id="cb1-1394"><a href="#cb1-1394" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to tie adapter weights for tied model weights. See</span></span>
+<span id="cb1-1395"><a href="#cb1-1395" aria-hidden="true" tabindex="-1"></a><span class="co"># https://github.com/huggingface/peft/issues/2864</span></span>
+<span id="cb1-1396"><a href="#cb1-1396" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_ensure_weight_tying</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1397"><a href="#cb1-1397" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to upcast the LoRA adapter to fp32. This is enabled by default in PEFT.</span></span>
+<span id="cb1-1398"><a href="#cb1-1398" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_autocast_adapter_dtype</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1399"><a href="#cb1-1399" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1400"><a href="#cb1-1400" aria-hidden="true" tabindex="-1"></a><span class="co"># load qlora model in sharded format for FSDP using answer.ai technique.</span></span>
+<span id="cb1-1401"><a href="#cb1-1401" aria-hidden="true" tabindex="-1"></a><span class="fu">qlora_sharded_model_loading</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-1402"><a href="#cb1-1402" aria-hidden="true" tabindex="-1"></a><span class="co"># Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it</span></span>
+<span id="cb1-1403"><a href="#cb1-1403" aria-hidden="true" tabindex="-1"></a><span class="co"># takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge</span></span>
+<span id="cb1-1404"><a href="#cb1-1404" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_on_cpu</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1405"><a href="#cb1-1405" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether you are training a 4-bit GPTQ quantized model</span></span>
+<span id="cb1-1406"><a href="#cb1-1406" aria-hidden="true" tabindex="-1"></a><span class="fu">gptq</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1407"><a href="#cb1-1407" aria-hidden="true" tabindex="-1"></a><span class="co"># optional overrides to the bnb 4bit quantization configuration</span></span>
+<span id="cb1-1408"><a href="#cb1-1408" aria-hidden="true" tabindex="-1"></a><span class="fu">bnb_config_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
+<span id="cb1-1409"><a href="#cb1-1409" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1410"><a href="#cb1-1410" aria-hidden="true" tabindex="-1"></a><span class="co"># loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.</span></span>
+<span id="cb1-1411"><a href="#cb1-1411" aria-hidden="true" tabindex="-1"></a><span class="fu">loraplus_lr_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1412"><a href="#cb1-1412" aria-hidden="true" tabindex="-1"></a><span class="co"># loraplus learning rate for lora embedding layers. Default value is 1e-6.</span></span>
+<span id="cb1-1413"><a href="#cb1-1413" aria-hidden="true" tabindex="-1"></a><span class="fu">loraplus_lr_embedding</span><span class="kw">:</span><span class="at"> float | None = 1e-06</span></span>
+<span id="cb1-1414"><a href="#cb1-1414" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1415"><a href="#cb1-1415" aria-hidden="true" tabindex="-1"></a><span class="fu">merge_lora</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1416"><a href="#cb1-1416" aria-hidden="true" tabindex="-1"></a><span class="co"># Method to use for LoRA merging. 'memory_efficient' (default) processes shards</span></span>
+<span id="cb1-1417"><a href="#cb1-1417" aria-hidden="true" tabindex="-1"></a><span class="co"># individually to reduce memory usage, 'legacy' loads the full model into memory.</span></span>
+<span id="cb1-1418"><a href="#cb1-1418" aria-hidden="true" tabindex="-1"></a><span class="fu">merge_method</span><span class="kw">:</span><span class="at"> Literal['legacy', 'memory_efficient'] | None = memory_efficient</span></span>
+<span id="cb1-1419"><a href="#cb1-1419" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1420"><a href="#cb1-1420" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use ReLoRA. Use with jagged_restart_*steps options.</span></span>
+<span id="cb1-1421"><a href="#cb1-1421" aria-hidden="true" tabindex="-1"></a><span class="fu">relora</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1422"><a href="#cb1-1422" aria-hidden="true" tabindex="-1"></a><span class="co"># threshold for optimizer magnitude when pruning</span></span>
+<span id="cb1-1423"><a href="#cb1-1423" aria-hidden="true" tabindex="-1"></a><span class="fu">relora_prune_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1424"><a href="#cb1-1424" aria-hidden="true" tabindex="-1"></a><span class="co"># True to perform lora weight merges on cpu during restarts, for modest gpu memory</span></span>
+<span id="cb1-1425"><a href="#cb1-1425" aria-hidden="true" tabindex="-1"></a><span class="co"># savings</span></span>
+<span id="cb1-1426"><a href="#cb1-1426" aria-hidden="true" tabindex="-1"></a><span class="fu">relora_cpu_offload</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1427"><a href="#cb1-1427" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1428"><a href="#cb1-1428" aria-hidden="true" tabindex="-1"></a><span class="co"># how often to reset for jagged restarts</span></span>
+<span id="cb1-1429"><a href="#cb1-1429" aria-hidden="true" tabindex="-1"></a><span class="fu">jagged_restart_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1430"><a href="#cb1-1430" aria-hidden="true" tabindex="-1"></a><span class="co"># how many warmup steps to take after reset for jagged restarts</span></span>
+<span id="cb1-1431"><a href="#cb1-1431" aria-hidden="true" tabindex="-1"></a><span class="fu">jagged_restart_warmup_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1432"><a href="#cb1-1432" aria-hidden="true" tabindex="-1"></a><span class="co"># how many anneal steps to take before reset for jagged restarts</span></span>
+<span id="cb1-1433"><a href="#cb1-1433" aria-hidden="true" tabindex="-1"></a><span class="fu">jagged_restart_anneal_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1434"><a href="#cb1-1434" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1435"><a href="#cb1-1435" aria-hidden="true" tabindex="-1"></a><span class="co"># If greater than 1, backpropagation will be skipped and the gradients will be</span></span>
+<span id="cb1-1436"><a href="#cb1-1436" aria-hidden="true" tabindex="-1"></a><span class="co"># accumulated for the given number of steps.</span></span>
+<span id="cb1-1437"><a href="#cb1-1437" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_accumulation_steps</span><span class="kw">:</span><span class="at"> int | None = 1</span></span>
+<span id="cb1-1438"><a href="#cb1-1438" aria-hidden="true" tabindex="-1"></a><span class="co"># The number of samples to include in each batch. This is the number of samples sent to</span></span>
+<span id="cb1-1439"><a href="#cb1-1439" aria-hidden="true" tabindex="-1"></a><span class="co"># each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps</span></span>
+<span id="cb1-1440"><a href="#cb1-1440" aria-hidden="true" tabindex="-1"></a><span class="fu">micro_batch_size</span><span class="kw">:</span><span class="at"> int | None = 1</span></span>
+<span id="cb1-1441"><a href="#cb1-1441" aria-hidden="true" tabindex="-1"></a><span class="co"># Total batch size, we do not recommended setting this manually</span></span>
+<span id="cb1-1442"><a href="#cb1-1442" aria-hidden="true" tabindex="-1"></a><span class="fu">batch_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1443"><a href="#cb1-1443" aria-hidden="true" tabindex="-1"></a><span class="co"># per gpu micro batch size for evals, defaults to value of micro_batch_size</span></span>
+<span id="cb1-1444"><a href="#cb1-1444" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_batch_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1445"><a href="#cb1-1445" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1446"><a href="#cb1-1446" aria-hidden="true" tabindex="-1"></a><span class="co"># whether to find batch size that fits in memory. Passed to underlying transformers</span></span>
+<span id="cb1-1447"><a href="#cb1-1447" aria-hidden="true" tabindex="-1"></a><span class="co"># Trainer</span></span>
+<span id="cb1-1448"><a href="#cb1-1448" aria-hidden="true" tabindex="-1"></a><span class="fu">auto_find_batch_size</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1449"><a href="#cb1-1449" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1450"><a href="#cb1-1450" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to mask out or include the human's prompt from the training labels</span></span>
+<span id="cb1-1451"><a href="#cb1-1451" aria-hidden="true" tabindex="-1"></a><span class="fu">train_on_inputs</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-1452"><a href="#cb1-1452" aria-hidden="true" tabindex="-1"></a><span class="co"># Group similarly sized data to minimize padding. May be slower to start, as it must</span></span>
+<span id="cb1-1453"><a href="#cb1-1453" aria-hidden="true" tabindex="-1"></a><span class="co"># download and sort the entire dataset. Note that training loss may have an oscillating</span></span>
+<span id="cb1-1454"><a href="#cb1-1454" aria-hidden="true" tabindex="-1"></a><span class="co"># pattern with this enabled.</span></span>
+<span id="cb1-1455"><a href="#cb1-1455" aria-hidden="true" tabindex="-1"></a><span class="fu">group_by_length</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1456"><a href="#cb1-1456" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1457"><a href="#cb1-1457" aria-hidden="true" tabindex="-1"></a><span class="fu">learning_rate</span><span class="kw">:</span><span class="at"> str | float (required)</span></span>
+<span id="cb1-1458"><a href="#cb1-1458" aria-hidden="true" tabindex="-1"></a><span class="fu">embedding_lr</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1459"><a href="#cb1-1459" aria-hidden="true" tabindex="-1"></a><span class="fu">embedding_lr_scale</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1460"><a href="#cb1-1460" aria-hidden="true" tabindex="-1"></a><span class="co"># Specify weight decay</span></span>
+<span id="cb1-1461"><a href="#cb1-1461" aria-hidden="true" tabindex="-1"></a><span class="fu">weight_decay</span><span class="kw">:</span><span class="at"> float | None = 0.0</span></span>
+<span id="cb1-1462"><a href="#cb1-1462" aria-hidden="true" tabindex="-1"></a><span class="co"># Specify optimizer</span></span>
+<span id="cb1-1463"><a href="#cb1-1463" aria-hidden="true" tabindex="-1"></a><span class="fu">optimizer</span><span class="kw">:</span><span class="at"> OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED</span></span>
+<span id="cb1-1464"><a href="#cb1-1464" aria-hidden="true" tabindex="-1"></a><span class="co"># Dictionary of arguments to pass to the optimizer</span></span>
+<span id="cb1-1465"><a href="#cb1-1465" aria-hidden="true" tabindex="-1"></a><span class="fu">optim_args</span><span class="kw">:</span><span class="at"> str | dict[str, Any] | None</span></span>
+<span id="cb1-1466"><a href="#cb1-1466" aria-hidden="true" tabindex="-1"></a><span class="co"># The target modules to optimize, i.e. the module names that you would like to train,</span></span>
+<span id="cb1-1467"><a href="#cb1-1467" aria-hidden="true" tabindex="-1"></a><span class="co"># right now this is used only for GaLore algorithm</span></span>
+<span id="cb1-1468"><a href="#cb1-1468" aria-hidden="true" tabindex="-1"></a><span class="fu">optim_target_modules</span><span class="kw">:</span><span class="at"> list[str] | Literal['all_linear'] | None</span></span>
+<span id="cb1-1469"><a href="#cb1-1469" aria-hidden="true" tabindex="-1"></a><span class="co"># Path to torch distx for optim 'adamw_anyprecision'</span></span>
+<span id="cb1-1470"><a href="#cb1-1470" aria-hidden="true" tabindex="-1"></a><span class="fu">torchdistx_path</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1471"><a href="#cb1-1471" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_scheduler</span><span class="kw">:</span><span class="at"> SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE</span></span>
+<span id="cb1-1472"><a href="#cb1-1472" aria-hidden="true" tabindex="-1"></a><span class="co"># Specify a scheduler and kwargs to use with the optimizer</span></span>
+<span id="cb1-1473"><a href="#cb1-1473" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_scheduler_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
+<span id="cb1-1474"><a href="#cb1-1474" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_quadratic_warmup</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1475"><a href="#cb1-1475" aria-hidden="true" tabindex="-1"></a><span class="co"># decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of</span></span>
+<span id="cb1-1476"><a href="#cb1-1476" aria-hidden="true" tabindex="-1"></a><span class="co"># peak lr</span></span>
+<span id="cb1-1477"><a href="#cb1-1477" aria-hidden="true" tabindex="-1"></a><span class="fu">cosine_min_lr_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1478"><a href="#cb1-1478" aria-hidden="true" tabindex="-1"></a><span class="co"># freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means</span></span>
+<span id="cb1-1479"><a href="#cb1-1479" aria-hidden="true" tabindex="-1"></a><span class="co"># start cosine_min_lr at 80% of training step</span></span>
+<span id="cb1-1480"><a href="#cb1-1480" aria-hidden="true" tabindex="-1"></a><span class="fu">cosine_constant_lr_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1481"><a href="#cb1-1481" aria-hidden="true" tabindex="-1"></a><span class="co"># Learning rate div factor</span></span>
+<span id="cb1-1482"><a href="#cb1-1482" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_div_factor</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1483"><a href="#cb1-1483" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1484"><a href="#cb1-1484" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_groups</span><span class="kw">:</span><span class="at"> list[LrGroup] | None</span></span>
+<span id="cb1-1485"><a href="#cb1-1485" aria-hidden="true" tabindex="-1"></a><span class="co">  # For LrGroup:</span></span>
+<span id="cb1-1486"><a href="#cb1-1486" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">name</span><span class="kw">:</span><span class="at"> str (required)</span></span>
+<span id="cb1-1487"><a href="#cb1-1487" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">modules</span><span class="kw">:</span><span class="at"> list[str] (required)</span></span>
+<span id="cb1-1488"><a href="#cb1-1488" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">lr</span><span class="kw">:</span><span class="at"> float (required)</span></span>
+<span id="cb1-1489"><a href="#cb1-1489" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1490"><a href="#cb1-1490" aria-hidden="true" tabindex="-1"></a><span class="co"># adamw hyperparams</span></span>
+<span id="cb1-1491"><a href="#cb1-1491" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_epsilon</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1492"><a href="#cb1-1492" aria-hidden="true" tabindex="-1"></a><span class="co"># only used for CAME Optimizer</span></span>
+<span id="cb1-1493"><a href="#cb1-1493" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_epsilon2</span><span class="kw">:</span><span class="at"> float | None</span></span>
 <span id="cb1-1494"><a href="#cb1-1494" aria-hidden="true" tabindex="-1"></a><span class="co"># adamw hyperparams</span></span>
-<span id="cb1-1495"><a href="#cb1-1495" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_beta2</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1496"><a href="#cb1-1496" aria-hidden="true" tabindex="-1"></a><span class="co"># only used for CAME Optimizer</span></span>
-<span id="cb1-1497"><a href="#cb1-1497" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_beta3</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1498"><a href="#cb1-1498" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1499"><a href="#cb1-1499" aria-hidden="true" tabindex="-1"></a><span class="co"># Dion Optimizer learning rate</span></span>
-<span id="cb1-1500"><a href="#cb1-1500" aria-hidden="true" tabindex="-1"></a><span class="fu">dion_lr</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1501"><a href="#cb1-1501" aria-hidden="true" tabindex="-1"></a><span class="co"># Dion Optimizer momentum</span></span>
-<span id="cb1-1502"><a href="#cb1-1502" aria-hidden="true" tabindex="-1"></a><span class="fu">dion_momentum</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1503"><a href="#cb1-1503" aria-hidden="true" tabindex="-1"></a><span class="co"># Dion Optimizer: r/d fraction for low-rank approximation. Used to compute the low-rank</span></span>
-<span id="cb1-1504"><a href="#cb1-1504" aria-hidden="true" tabindex="-1"></a><span class="co"># dimension.</span></span>
-<span id="cb1-1505"><a href="#cb1-1505" aria-hidden="true" tabindex="-1"></a><span class="fu">dion_rank_fraction</span><span class="kw">:</span><span class="at"> float | None = 1.0</span></span>
-<span id="cb1-1506"><a href="#cb1-1506" aria-hidden="true" tabindex="-1"></a><span class="co"># Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may</span></span>
-<span id="cb1-1507"><a href="#cb1-1507" aria-hidden="true" tabindex="-1"></a><span class="co"># be useful to ensure even sharding.</span></span>
-<span id="cb1-1508"><a href="#cb1-1508" aria-hidden="true" tabindex="-1"></a><span class="fu">dion_rank_multiple_of</span><span class="kw">:</span><span class="at"> int | None = 1</span></span>
-<span id="cb1-1509"><a href="#cb1-1509" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1510"><a href="#cb1-1510" aria-hidden="true" tabindex="-1"></a><span class="co"># Gradient clipping max norm</span></span>
-<span id="cb1-1511"><a href="#cb1-1511" aria-hidden="true" tabindex="-1"></a><span class="fu">max_grad_norm</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1512"><a href="#cb1-1512" aria-hidden="true" tabindex="-1"></a><span class="fu">num_epochs</span><span class="kw">:</span><span class="at"> float = 1.0</span></span>
-<span id="cb1-1513"><a href="#cb1-1513" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1514"><a href="#cb1-1514" aria-hidden="true" tabindex="-1"></a><span class="fu">use_wandb</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1515"><a href="#cb1-1515" aria-hidden="true" tabindex="-1"></a><span class="co"># Set the name of your wandb run</span></span>
-<span id="cb1-1516"><a href="#cb1-1516" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1517"><a href="#cb1-1517" aria-hidden="true" tabindex="-1"></a><span class="co"># Set the ID of your wandb run</span></span>
-<span id="cb1-1518"><a href="#cb1-1518" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_run_id</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1519"><a href="#cb1-1519" aria-hidden="true" tabindex="-1"></a><span class="co"># "offline" to save run metadata locally and not sync to the server, "disabled" to turn</span></span>
-<span id="cb1-1520"><a href="#cb1-1520" aria-hidden="true" tabindex="-1"></a><span class="co"># off wandb</span></span>
-<span id="cb1-1521"><a href="#cb1-1521" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_mode</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1522"><a href="#cb1-1522" aria-hidden="true" tabindex="-1"></a><span class="co"># Your wandb project name</span></span>
-<span id="cb1-1523"><a href="#cb1-1523" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_project</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1524"><a href="#cb1-1524" aria-hidden="true" tabindex="-1"></a><span class="co"># A wandb Team name if using a Team</span></span>
-<span id="cb1-1525"><a href="#cb1-1525" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_entity</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1526"><a href="#cb1-1526" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_watch</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1527"><a href="#cb1-1527" aria-hidden="true" tabindex="-1"></a><span class="co"># "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only</span></span>
-<span id="cb1-1528"><a href="#cb1-1528" aria-hidden="true" tabindex="-1"></a><span class="co"># at the end of training</span></span>
-<span id="cb1-1529"><a href="#cb1-1529" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_log_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1530"><a href="#cb1-1530" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1531"><a href="#cb1-1531" aria-hidden="true" tabindex="-1"></a><span class="fu">use_mlflow</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1532"><a href="#cb1-1532" aria-hidden="true" tabindex="-1"></a><span class="co"># URI to mlflow</span></span>
-<span id="cb1-1533"><a href="#cb1-1533" aria-hidden="true" tabindex="-1"></a><span class="fu">mlflow_tracking_uri</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1534"><a href="#cb1-1534" aria-hidden="true" tabindex="-1"></a><span class="co"># Your experiment name</span></span>
-<span id="cb1-1535"><a href="#cb1-1535" aria-hidden="true" tabindex="-1"></a><span class="fu">mlflow_experiment_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1536"><a href="#cb1-1536" aria-hidden="true" tabindex="-1"></a><span class="co"># Your run name</span></span>
-<span id="cb1-1537"><a href="#cb1-1537" aria-hidden="true" tabindex="-1"></a><span class="fu">mlflow_run_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1538"><a href="#cb1-1538" aria-hidden="true" tabindex="-1"></a><span class="co"># set to true to copy each saved checkpoint on each save to mlflow artifact registry</span></span>
-<span id="cb1-1539"><a href="#cb1-1539" aria-hidden="true" tabindex="-1"></a><span class="fu">hf_mlflow_log_artifacts</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1540"><a href="#cb1-1540" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1541"><a href="#cb1-1541" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable or disable Comet integration.</span></span>
-<span id="cb1-1542"><a href="#cb1-1542" aria-hidden="true" tabindex="-1"></a><span class="fu">use_comet</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1543"><a href="#cb1-1543" aria-hidden="true" tabindex="-1"></a><span class="co"># API key for Comet. Recommended to set via `comet login`.</span></span>
-<span id="cb1-1544"><a href="#cb1-1544" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_api_key</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1545"><a href="#cb1-1545" aria-hidden="true" tabindex="-1"></a><span class="co"># Workspace name in Comet. Defaults to the user's default workspace.</span></span>
-<span id="cb1-1546"><a href="#cb1-1546" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_workspace</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1547"><a href="#cb1-1547" aria-hidden="true" tabindex="-1"></a><span class="co"># Project name in Comet. Defaults to Uncategorized.</span></span>
-<span id="cb1-1548"><a href="#cb1-1548" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_project_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1549"><a href="#cb1-1549" aria-hidden="true" tabindex="-1"></a><span class="co"># Identifier for the experiment. Used to append data to an existing experiment or</span></span>
-<span id="cb1-1550"><a href="#cb1-1550" aria-hidden="true" tabindex="-1"></a><span class="co"># control the key of new experiments. Default to a random key.</span></span>
-<span id="cb1-1551"><a href="#cb1-1551" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_experiment_key</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1552"><a href="#cb1-1552" aria-hidden="true" tabindex="-1"></a><span class="co"># Create a new experiment ("create") or log to an existing one ("get"). Default</span></span>
-<span id="cb1-1553"><a href="#cb1-1553" aria-hidden="true" tabindex="-1"></a><span class="co"># ("get_or_create") auto-selects based on configuration.</span></span>
-<span id="cb1-1554"><a href="#cb1-1554" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_mode</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1555"><a href="#cb1-1555" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to True to log data to Comet server, or False for offline storage. Default is</span></span>
-<span id="cb1-1556"><a href="#cb1-1556" aria-hidden="true" tabindex="-1"></a><span class="co"># True.</span></span>
-<span id="cb1-1557"><a href="#cb1-1557" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_online</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1558"><a href="#cb1-1558" aria-hidden="true" tabindex="-1"></a><span class="co"># Dictionary for additional configuration settings, see the doc for more details.</span></span>
-<span id="cb1-1559"><a href="#cb1-1559" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_experiment_config</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
-<span id="cb1-1560"><a href="#cb1-1560" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1561"><a href="#cb1-1561" aria-hidden="true" tabindex="-1"></a><span class="fu">use_trackio</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1562"><a href="#cb1-1562" aria-hidden="true" tabindex="-1"></a><span class="co"># Your trackio project name</span></span>
-<span id="cb1-1563"><a href="#cb1-1563" aria-hidden="true" tabindex="-1"></a><span class="fu">trackio_project_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1564"><a href="#cb1-1564" aria-hidden="true" tabindex="-1"></a><span class="co"># Set the name of your trackio run</span></span>
-<span id="cb1-1565"><a href="#cb1-1565" aria-hidden="true" tabindex="-1"></a><span class="fu">trackio_run_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1566"><a href="#cb1-1566" aria-hidden="true" tabindex="-1"></a><span class="co"># Hugging Face Space ID to sync dashboard to (optional, runs locally if not provided)</span></span>
-<span id="cb1-1567"><a href="#cb1-1567" aria-hidden="true" tabindex="-1"></a><span class="fu">trackio_space_id</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1568"><a href="#cb1-1568" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1569"><a href="#cb1-1569" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable OpenTelemetry metrics collection and Prometheus export</span></span>
-<span id="cb1-1570"><a href="#cb1-1570" aria-hidden="true" tabindex="-1"></a><span class="fu">use_otel_metrics</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-1571"><a href="#cb1-1571" aria-hidden="true" tabindex="-1"></a><span class="co"># Host to bind the OpenTelemetry metrics server to</span></span>
-<span id="cb1-1572"><a href="#cb1-1572" aria-hidden="true" tabindex="-1"></a><span class="fu">otel_metrics_host</span><span class="kw">:</span><span class="at"> str | None = localhost</span></span>
-<span id="cb1-1573"><a href="#cb1-1573" aria-hidden="true" tabindex="-1"></a><span class="co"># Port for the Prometheus metrics HTTP server</span></span>
-<span id="cb1-1574"><a href="#cb1-1574" aria-hidden="true" tabindex="-1"></a><span class="fu">otel_metrics_port</span><span class="kw">:</span><span class="at"> int | None = 8000</span></span>
-<span id="cb1-1575"><a href="#cb1-1575" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1576"><a href="#cb1-1576" aria-hidden="true" tabindex="-1"></a><span class="co"># the number of activate layers in LISA</span></span>
-<span id="cb1-1577"><a href="#cb1-1577" aria-hidden="true" tabindex="-1"></a><span class="fu">lisa_n_layers</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1578"><a href="#cb1-1578" aria-hidden="true" tabindex="-1"></a><span class="co"># how often to switch layers in LISA</span></span>
-<span id="cb1-1579"><a href="#cb1-1579" aria-hidden="true" tabindex="-1"></a><span class="fu">lisa_step_interval</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1580"><a href="#cb1-1580" aria-hidden="true" tabindex="-1"></a><span class="co"># path under the model to access the layers</span></span>
-<span id="cb1-1581"><a href="#cb1-1581" aria-hidden="true" tabindex="-1"></a><span class="fu">lisa_layers_attribute</span><span class="kw">:</span><span class="at"> str | None = model.layers</span></span>
-<span id="cb1-1582"><a href="#cb1-1582" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1583"><a href="#cb1-1583" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_title</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1584"><a href="#cb1-1584" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_share</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1585"><a href="#cb1-1585" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_server_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1586"><a href="#cb1-1586" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_server_port</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1587"><a href="#cb1-1587" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_max_new_tokens</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1588"><a href="#cb1-1588" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_temperature</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1589"><a href="#cb1-1589" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1590"><a href="#cb1-1590" aria-hidden="true" tabindex="-1"></a><span class="fu">use_ray</span><span class="kw">:</span><span class="at"> bool = False</span></span>
-<span id="cb1-1591"><a href="#cb1-1591" aria-hidden="true" tabindex="-1"></a><span class="fu">ray_run_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1592"><a href="#cb1-1592" aria-hidden="true" tabindex="-1"></a><span class="fu">ray_num_workers</span><span class="kw">:</span><span class="at"> int = 1</span></span>
-<span id="cb1-1593"><a href="#cb1-1593" aria-hidden="true" tabindex="-1"></a><span class="fu">resources_per_worker</span><span class="kw">:</span><span class="at"> dict</span></span>
-<span id="cb1-1594"><a href="#cb1-1594" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1595"><a href="#cb1-1595" aria-hidden="true" tabindex="-1"></a><span class="co"># The size of the image to resize to. It can be an integer (resized into padded-square</span></span>
-<span id="cb1-1596"><a href="#cb1-1596" aria-hidden="true" tabindex="-1"></a><span class="co"># image) or a tuple (width, height).If not provided, we will attempt to load from</span></span>
-<span id="cb1-1597"><a href="#cb1-1597" aria-hidden="true" tabindex="-1"></a><span class="co"># preprocessor.size, otherwise, images won't be resized.</span></span>
-<span id="cb1-1598"><a href="#cb1-1598" aria-hidden="true" tabindex="-1"></a><span class="fu">image_size</span><span class="kw">:</span><span class="at"> int | tuple[int, int] | None</span></span>
-<span id="cb1-1599"><a href="#cb1-1599" aria-hidden="true" tabindex="-1"></a><span class="co"># The resampling algorithm to use for image resizing. Default is bilinear. Please refer</span></span>
-<span id="cb1-1600"><a href="#cb1-1600" aria-hidden="true" tabindex="-1"></a><span class="co"># to PIL.Image.Resampling for more details.</span></span>
-<span id="cb1-1601"><a href="#cb1-1601" aria-hidden="true" tabindex="-1"></a><span class="fu">image_resize_algorithm</span><span class="kw">:</span><span class="at"> Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None</span></span>
-<span id="cb1-1602"><a href="#cb1-1602" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1603"><a href="#cb1-1603" aria-hidden="true" tabindex="-1"></a><span class="co"># optional overrides to the base model configuration</span></span>
-<span id="cb1-1604"><a href="#cb1-1604" aria-hidden="true" tabindex="-1"></a><span class="fu">overrides_of_model_config</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
-<span id="cb1-1605"><a href="#cb1-1605" aria-hidden="true" tabindex="-1"></a><span class="co"># optional overrides the base model loading from_pretrained</span></span>
-<span id="cb1-1606"><a href="#cb1-1606" aria-hidden="true" tabindex="-1"></a><span class="fu">overrides_of_model_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
-<span id="cb1-1607"><a href="#cb1-1607" aria-hidden="true" tabindex="-1"></a><span class="co"># If you want to specify the type of model to load, AutoModelForCausalLM is a good</span></span>
-<span id="cb1-1608"><a href="#cb1-1608" aria-hidden="true" tabindex="-1"></a><span class="co"># choice too</span></span>
-<span id="cb1-1609"><a href="#cb1-1609" aria-hidden="true" tabindex="-1"></a><span class="fu">type_of_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1610"><a href="#cb1-1610" aria-hidden="true" tabindex="-1"></a><span class="co"># You can specify to choose a specific model revision from huggingface hub</span></span>
-<span id="cb1-1611"><a href="#cb1-1611" aria-hidden="true" tabindex="-1"></a><span class="fu">revision_of_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1612"><a href="#cb1-1612" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1613"><a href="#cb1-1613" aria-hidden="true" tabindex="-1"></a><span class="fu">max_packed_sequence_len</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1614"><a href="#cb1-1614" aria-hidden="true" tabindex="-1"></a><span class="fu">rope_scaling</span><span class="kw">:</span><span class="at"> Any | None</span></span>
-<span id="cb1-1615"><a href="#cb1-1615" aria-hidden="true" tabindex="-1"></a><span class="fu">noisy_embedding_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1616"><a href="#cb1-1616" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_beta</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1617"><a href="#cb1-1617" aria-hidden="true" tabindex="-1"></a><span class="fu">evaluation_strategy</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1618"><a href="#cb1-1618" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_table_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1619"><a href="#cb1-1619" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_max_new_tokens</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1620"><a href="#cb1-1620" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_use_logits_to_keep</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1621"><a href="#cb1-1621" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_generate_during_eval</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1622"><a href="#cb1-1622" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_norm_loss</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1623"><a href="#cb1-1623" aria-hidden="true" tabindex="-1"></a><span class="fu">rpo_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<span id="cb1-1495"><a href="#cb1-1495" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_beta1</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1496"><a href="#cb1-1496" aria-hidden="true" tabindex="-1"></a><span class="co"># adamw hyperparams</span></span>
+<span id="cb1-1497"><a href="#cb1-1497" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_beta2</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1498"><a href="#cb1-1498" aria-hidden="true" tabindex="-1"></a><span class="co"># only used for CAME Optimizer</span></span>
+<span id="cb1-1499"><a href="#cb1-1499" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_beta3</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1500"><a href="#cb1-1500" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1501"><a href="#cb1-1501" aria-hidden="true" tabindex="-1"></a><span class="co"># Dion Optimizer learning rate</span></span>
+<span id="cb1-1502"><a href="#cb1-1502" aria-hidden="true" tabindex="-1"></a><span class="fu">dion_lr</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1503"><a href="#cb1-1503" aria-hidden="true" tabindex="-1"></a><span class="co"># Dion Optimizer momentum</span></span>
+<span id="cb1-1504"><a href="#cb1-1504" aria-hidden="true" tabindex="-1"></a><span class="fu">dion_momentum</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1505"><a href="#cb1-1505" aria-hidden="true" tabindex="-1"></a><span class="co"># Dion Optimizer: r/d fraction for low-rank approximation. Used to compute the low-rank</span></span>
+<span id="cb1-1506"><a href="#cb1-1506" aria-hidden="true" tabindex="-1"></a><span class="co"># dimension.</span></span>
+<span id="cb1-1507"><a href="#cb1-1507" aria-hidden="true" tabindex="-1"></a><span class="fu">dion_rank_fraction</span><span class="kw">:</span><span class="at"> float | None = 1.0</span></span>
+<span id="cb1-1508"><a href="#cb1-1508" aria-hidden="true" tabindex="-1"></a><span class="co"># Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may</span></span>
+<span id="cb1-1509"><a href="#cb1-1509" aria-hidden="true" tabindex="-1"></a><span class="co"># be useful to ensure even sharding.</span></span>
+<span id="cb1-1510"><a href="#cb1-1510" aria-hidden="true" tabindex="-1"></a><span class="fu">dion_rank_multiple_of</span><span class="kw">:</span><span class="at"> int | None = 1</span></span>
+<span id="cb1-1511"><a href="#cb1-1511" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1512"><a href="#cb1-1512" aria-hidden="true" tabindex="-1"></a><span class="co"># Gradient clipping max norm</span></span>
+<span id="cb1-1513"><a href="#cb1-1513" aria-hidden="true" tabindex="-1"></a><span class="fu">max_grad_norm</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1514"><a href="#cb1-1514" aria-hidden="true" tabindex="-1"></a><span class="fu">num_epochs</span><span class="kw">:</span><span class="at"> float = 1.0</span></span>
+<span id="cb1-1515"><a href="#cb1-1515" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1516"><a href="#cb1-1516" aria-hidden="true" tabindex="-1"></a><span class="fu">use_wandb</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1517"><a href="#cb1-1517" aria-hidden="true" tabindex="-1"></a><span class="co"># Set the name of your wandb run</span></span>
+<span id="cb1-1518"><a href="#cb1-1518" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1519"><a href="#cb1-1519" aria-hidden="true" tabindex="-1"></a><span class="co"># Set the ID of your wandb run</span></span>
+<span id="cb1-1520"><a href="#cb1-1520" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_run_id</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1521"><a href="#cb1-1521" aria-hidden="true" tabindex="-1"></a><span class="co"># "offline" to save run metadata locally and not sync to the server, "disabled" to turn</span></span>
+<span id="cb1-1522"><a href="#cb1-1522" aria-hidden="true" tabindex="-1"></a><span class="co"># off wandb</span></span>
+<span id="cb1-1523"><a href="#cb1-1523" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_mode</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1524"><a href="#cb1-1524" aria-hidden="true" tabindex="-1"></a><span class="co"># Your wandb project name</span></span>
+<span id="cb1-1525"><a href="#cb1-1525" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_project</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1526"><a href="#cb1-1526" aria-hidden="true" tabindex="-1"></a><span class="co"># A wandb Team name if using a Team</span></span>
+<span id="cb1-1527"><a href="#cb1-1527" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_entity</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1528"><a href="#cb1-1528" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_watch</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1529"><a href="#cb1-1529" aria-hidden="true" tabindex="-1"></a><span class="co"># "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only</span></span>
+<span id="cb1-1530"><a href="#cb1-1530" aria-hidden="true" tabindex="-1"></a><span class="co"># at the end of training</span></span>
+<span id="cb1-1531"><a href="#cb1-1531" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_log_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1532"><a href="#cb1-1532" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1533"><a href="#cb1-1533" aria-hidden="true" tabindex="-1"></a><span class="fu">use_mlflow</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1534"><a href="#cb1-1534" aria-hidden="true" tabindex="-1"></a><span class="co"># URI to mlflow</span></span>
+<span id="cb1-1535"><a href="#cb1-1535" aria-hidden="true" tabindex="-1"></a><span class="fu">mlflow_tracking_uri</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1536"><a href="#cb1-1536" aria-hidden="true" tabindex="-1"></a><span class="co"># Your experiment name</span></span>
+<span id="cb1-1537"><a href="#cb1-1537" aria-hidden="true" tabindex="-1"></a><span class="fu">mlflow_experiment_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1538"><a href="#cb1-1538" aria-hidden="true" tabindex="-1"></a><span class="co"># Your run name</span></span>
+<span id="cb1-1539"><a href="#cb1-1539" aria-hidden="true" tabindex="-1"></a><span class="fu">mlflow_run_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1540"><a href="#cb1-1540" aria-hidden="true" tabindex="-1"></a><span class="co"># set to true to copy each saved checkpoint on each save to mlflow artifact registry</span></span>
+<span id="cb1-1541"><a href="#cb1-1541" aria-hidden="true" tabindex="-1"></a><span class="fu">hf_mlflow_log_artifacts</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1542"><a href="#cb1-1542" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1543"><a href="#cb1-1543" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable or disable Comet integration.</span></span>
+<span id="cb1-1544"><a href="#cb1-1544" aria-hidden="true" tabindex="-1"></a><span class="fu">use_comet</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1545"><a href="#cb1-1545" aria-hidden="true" tabindex="-1"></a><span class="co"># API key for Comet. Recommended to set via `comet login`.</span></span>
+<span id="cb1-1546"><a href="#cb1-1546" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_api_key</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1547"><a href="#cb1-1547" aria-hidden="true" tabindex="-1"></a><span class="co"># Workspace name in Comet. Defaults to the user's default workspace.</span></span>
+<span id="cb1-1548"><a href="#cb1-1548" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_workspace</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1549"><a href="#cb1-1549" aria-hidden="true" tabindex="-1"></a><span class="co"># Project name in Comet. Defaults to Uncategorized.</span></span>
+<span id="cb1-1550"><a href="#cb1-1550" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_project_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1551"><a href="#cb1-1551" aria-hidden="true" tabindex="-1"></a><span class="co"># Identifier for the experiment. Used to append data to an existing experiment or</span></span>
+<span id="cb1-1552"><a href="#cb1-1552" aria-hidden="true" tabindex="-1"></a><span class="co"># control the key of new experiments. Default to a random key.</span></span>
+<span id="cb1-1553"><a href="#cb1-1553" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_experiment_key</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1554"><a href="#cb1-1554" aria-hidden="true" tabindex="-1"></a><span class="co"># Create a new experiment ("create") or log to an existing one ("get"). Default</span></span>
+<span id="cb1-1555"><a href="#cb1-1555" aria-hidden="true" tabindex="-1"></a><span class="co"># ("get_or_create") auto-selects based on configuration.</span></span>
+<span id="cb1-1556"><a href="#cb1-1556" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_mode</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1557"><a href="#cb1-1557" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to True to log data to Comet server, or False for offline storage. Default is</span></span>
+<span id="cb1-1558"><a href="#cb1-1558" aria-hidden="true" tabindex="-1"></a><span class="co"># True.</span></span>
+<span id="cb1-1559"><a href="#cb1-1559" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_online</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1560"><a href="#cb1-1560" aria-hidden="true" tabindex="-1"></a><span class="co"># Dictionary for additional configuration settings, see the doc for more details.</span></span>
+<span id="cb1-1561"><a href="#cb1-1561" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_experiment_config</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
+<span id="cb1-1562"><a href="#cb1-1562" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1563"><a href="#cb1-1563" aria-hidden="true" tabindex="-1"></a><span class="fu">use_trackio</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1564"><a href="#cb1-1564" aria-hidden="true" tabindex="-1"></a><span class="co"># Your trackio project name</span></span>
+<span id="cb1-1565"><a href="#cb1-1565" aria-hidden="true" tabindex="-1"></a><span class="fu">trackio_project_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1566"><a href="#cb1-1566" aria-hidden="true" tabindex="-1"></a><span class="co"># Set the name of your trackio run</span></span>
+<span id="cb1-1567"><a href="#cb1-1567" aria-hidden="true" tabindex="-1"></a><span class="fu">trackio_run_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1568"><a href="#cb1-1568" aria-hidden="true" tabindex="-1"></a><span class="co"># Hugging Face Space ID to sync dashboard to (optional, runs locally if not provided)</span></span>
+<span id="cb1-1569"><a href="#cb1-1569" aria-hidden="true" tabindex="-1"></a><span class="fu">trackio_space_id</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1570"><a href="#cb1-1570" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1571"><a href="#cb1-1571" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable OpenTelemetry metrics collection and Prometheus export</span></span>
+<span id="cb1-1572"><a href="#cb1-1572" aria-hidden="true" tabindex="-1"></a><span class="fu">use_otel_metrics</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-1573"><a href="#cb1-1573" aria-hidden="true" tabindex="-1"></a><span class="co"># Host to bind the OpenTelemetry metrics server to</span></span>
+<span id="cb1-1574"><a href="#cb1-1574" aria-hidden="true" tabindex="-1"></a><span class="fu">otel_metrics_host</span><span class="kw">:</span><span class="at"> str | None = localhost</span></span>
+<span id="cb1-1575"><a href="#cb1-1575" aria-hidden="true" tabindex="-1"></a><span class="co"># Port for the Prometheus metrics HTTP server</span></span>
+<span id="cb1-1576"><a href="#cb1-1576" aria-hidden="true" tabindex="-1"></a><span class="fu">otel_metrics_port</span><span class="kw">:</span><span class="at"> int | None = 8000</span></span>
+<span id="cb1-1577"><a href="#cb1-1577" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1578"><a href="#cb1-1578" aria-hidden="true" tabindex="-1"></a><span class="co"># the number of activate layers in LISA</span></span>
+<span id="cb1-1579"><a href="#cb1-1579" aria-hidden="true" tabindex="-1"></a><span class="fu">lisa_n_layers</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1580"><a href="#cb1-1580" aria-hidden="true" tabindex="-1"></a><span class="co"># how often to switch layers in LISA</span></span>
+<span id="cb1-1581"><a href="#cb1-1581" aria-hidden="true" tabindex="-1"></a><span class="fu">lisa_step_interval</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1582"><a href="#cb1-1582" aria-hidden="true" tabindex="-1"></a><span class="co"># path under the model to access the layers</span></span>
+<span id="cb1-1583"><a href="#cb1-1583" aria-hidden="true" tabindex="-1"></a><span class="fu">lisa_layers_attribute</span><span class="kw">:</span><span class="at"> str | None = model.layers</span></span>
+<span id="cb1-1584"><a href="#cb1-1584" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1585"><a href="#cb1-1585" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_title</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1586"><a href="#cb1-1586" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_share</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1587"><a href="#cb1-1587" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_server_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1588"><a href="#cb1-1588" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_server_port</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1589"><a href="#cb1-1589" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_max_new_tokens</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1590"><a href="#cb1-1590" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_temperature</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1591"><a href="#cb1-1591" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1592"><a href="#cb1-1592" aria-hidden="true" tabindex="-1"></a><span class="fu">use_ray</span><span class="kw">:</span><span class="at"> bool = False</span></span>
+<span id="cb1-1593"><a href="#cb1-1593" aria-hidden="true" tabindex="-1"></a><span class="fu">ray_run_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1594"><a href="#cb1-1594" aria-hidden="true" tabindex="-1"></a><span class="fu">ray_num_workers</span><span class="kw">:</span><span class="at"> int = 1</span></span>
+<span id="cb1-1595"><a href="#cb1-1595" aria-hidden="true" tabindex="-1"></a><span class="fu">resources_per_worker</span><span class="kw">:</span><span class="at"> dict</span></span>
+<span id="cb1-1596"><a href="#cb1-1596" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1597"><a href="#cb1-1597" aria-hidden="true" tabindex="-1"></a><span class="co"># The size of the image to resize to. It can be an integer (resized into padded-square</span></span>
+<span id="cb1-1598"><a href="#cb1-1598" aria-hidden="true" tabindex="-1"></a><span class="co"># image) or a tuple (width, height).If not provided, we will attempt to load from</span></span>
+<span id="cb1-1599"><a href="#cb1-1599" aria-hidden="true" tabindex="-1"></a><span class="co"># preprocessor.size, otherwise, images won't be resized.</span></span>
+<span id="cb1-1600"><a href="#cb1-1600" aria-hidden="true" tabindex="-1"></a><span class="fu">image_size</span><span class="kw">:</span><span class="at"> int | tuple[int, int] | None</span></span>
+<span id="cb1-1601"><a href="#cb1-1601" aria-hidden="true" tabindex="-1"></a><span class="co"># The resampling algorithm to use for image resizing. Default is bilinear. Please refer</span></span>
+<span id="cb1-1602"><a href="#cb1-1602" aria-hidden="true" tabindex="-1"></a><span class="co"># to PIL.Image.Resampling for more details.</span></span>
+<span id="cb1-1603"><a href="#cb1-1603" aria-hidden="true" tabindex="-1"></a><span class="fu">image_resize_algorithm</span><span class="kw">:</span><span class="at"> Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None</span></span>
+<span id="cb1-1604"><a href="#cb1-1604" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1605"><a href="#cb1-1605" aria-hidden="true" tabindex="-1"></a><span class="co"># optional overrides to the base model configuration</span></span>
+<span id="cb1-1606"><a href="#cb1-1606" aria-hidden="true" tabindex="-1"></a><span class="fu">overrides_of_model_config</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
+<span id="cb1-1607"><a href="#cb1-1607" aria-hidden="true" tabindex="-1"></a><span class="co"># optional overrides the base model loading from_pretrained</span></span>
+<span id="cb1-1608"><a href="#cb1-1608" aria-hidden="true" tabindex="-1"></a><span class="fu">overrides_of_model_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
+<span id="cb1-1609"><a href="#cb1-1609" aria-hidden="true" tabindex="-1"></a><span class="co"># If you want to specify the type of model to load, AutoModelForCausalLM is a good</span></span>
+<span id="cb1-1610"><a href="#cb1-1610" aria-hidden="true" tabindex="-1"></a><span class="co"># choice too</span></span>
+<span id="cb1-1611"><a href="#cb1-1611" aria-hidden="true" tabindex="-1"></a><span class="fu">type_of_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1612"><a href="#cb1-1612" aria-hidden="true" tabindex="-1"></a><span class="co"># You can specify to choose a specific model revision from huggingface hub</span></span>
+<span id="cb1-1613"><a href="#cb1-1613" aria-hidden="true" tabindex="-1"></a><span class="fu">revision_of_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1614"><a href="#cb1-1614" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1615"><a href="#cb1-1615" aria-hidden="true" tabindex="-1"></a><span class="fu">max_packed_sequence_len</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1616"><a href="#cb1-1616" aria-hidden="true" tabindex="-1"></a><span class="fu">rope_scaling</span><span class="kw">:</span><span class="at"> Any | None</span></span>
+<span id="cb1-1617"><a href="#cb1-1617" aria-hidden="true" tabindex="-1"></a><span class="fu">noisy_embedding_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1618"><a href="#cb1-1618" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_beta</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1619"><a href="#cb1-1619" aria-hidden="true" tabindex="-1"></a><span class="fu">evaluation_strategy</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1620"><a href="#cb1-1620" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_table_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1621"><a href="#cb1-1621" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_max_new_tokens</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1622"><a href="#cb1-1622" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_use_logits_to_keep</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1623"><a href="#cb1-1623" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_generate_during_eval</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1624"><a href="#cb1-1624" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_norm_loss</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1625"><a href="#cb1-1625" aria-hidden="true" tabindex="-1"></a><span class="fu">rpo_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 
 
 
diff --git a/search.json b/search.json
index b4029e615..8eff88459 100644
--- a/search.json
+++ b/search.json
@@ -3519,7 +3519,7 @@
     "href": "docs/config-reference.html",
     "title": "Config Reference",
     "section": "",
-    "text": "# Allow overwrite yml config using from cli\nstrict: bool | None = False\n# Resume from a specific checkpoint dir\nresume_from_checkpoint: str | None\n# If resume_from_checkpoint isn't set and you simply want it to start where it left off.\n# Be careful with this being turned on between different models.\nauto_resume_from_checkpoints: bool | None\n# Resize the model embeddings when new tokens are added to multiples of 32. This is\n# reported to improve training speed on some models\nresize_token_embeddings_to_32x: bool | None\nmean_resizing_embeddings: bool | None = False\n\n# Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.\nshrink_embeddings: bool | None\n# Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs\nembeddings_skip_upcast: bool | None\n# Reinitialize model weights randomly instead of loading pretrained weights\nreinit_weights: bool | None\n\n# module to custom trainer class to use for training\ntrainer_cls: str | None\n\n# Use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo', 'ebft'\nrl: RLType | None\n\ntrl: TRLConfig | None\n  # For TRLConfig:\n  # Beta parameter for the RL training. Same as `rl_beta`. Use\n  beta: float | None\n  # Maximum length of the completion for RL training.\n  max_completion_length: int | None\n\n  # Whether to use VLLM for RL training.\n  use_vllm: bool = False\n  # VLLM mode to use, one of 'server' or 'colocate'\n  vllm_mode: Literal['server', 'colocate'] | None\n  # Host of the vLLM server to connect to.\n  vllm_server_host: str | None = 0.0.0.0\n  # Port of the vLLM server to connect to.\n  vllm_server_port: int | None = 8000\n  # Total timeout (in seconds) to wait for the vLLM server to respond.\n  vllm_server_timeout: int | None\n  # Regex for vLLM guided decoding.\n  vllm_guided_decoding_regex: str | None\n\n  # List of reward functions to load. Paths must be importable from current dir.\n  reward_funcs: list[str] | None\n  # List of reward weights for the reward functions.\n  reward_weights: list[float] | None\n  # Batch size for generation. Controls how many unique prompts are generated per step.\n  # For full DP utilization, set to num_generations * data_parallel_size (or a multiple\n  # thereof).\n  generation_batch_size: int | None\n  # Number of generations to sample.\n  num_generations: int | None\n  # Whether to log completions.\n  log_completions: bool | None = False\n  # Number of completions to print when log_completions is True.\n  num_completions_to_print: int | None\n  # Controls whether importance sampling ratios are computed at the `'token'` or\n  # `'sequence'` level. For GSPO, use `sequence`, default is None which corresponds to\n  # the original GRPO paper.\n  importance_sampling_level: Literal['sequence', 'token'] | None\n\n  # Whether to sync the reference model.\n  sync_ref_model: bool | None = False\n  # Mixup alpha for the reference model.\n  ref_model_mixup_alpha: float | None = 0.9\n  # Sync steps for the reference model.\n  ref_model_sync_steps: int | None = 64\n  # Whether to scale rewards by their standard deviation.\n  scale_rewards: bool = True\n\n  # Sampling temperature for the GRPO policy.\n  temperature: float | None\n  # Top-p sampling probability for the generation policy.\n  top_p: float | None\n  # Top-k sampling for the generation policy.\n  top_k: int | None\n  # Minimum probability for the generation policy.\n  min_p: float | None\n  # Penalty for tokens that appear in prompt and generated text.\n  repetition_penalty: float | None\n  # Additional generation parameters passed to vLLM SamplingParams. Useful for\n  # stop_token_ids, seed, frequency_penalty, etc.\n  generation_kwargs: dict[str, Any] | None\n  # Additional kwargs for the chat template. E.g., {enable_thinking: false} for Qwen3.5\n  # models.\n  chat_template_kwargs: dict[str, Any] | None\n  # Number of iterations per batch (μ) for GRPO.\n  num_iterations: int | None\n  # Epsilon value for clipping in the GRPO algorithm.\n  epsilon: float | None\n  # Upper-bound epsilon value for clipping in the GRPO algorithm.\n  epsilon_high: float | None\n  # Whether to use Liger loss for GRPO.\n  use_liger_loss: bool | None\n  # Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.\n  loss_type: str | None\n  # Whether to exclude truncated completions from loss calculation.\n  mask_truncated_completions: bool = False\n  # Enable sleep mode for vLLM to offload VRAM when idle\n  vllm_enable_sleep_mode: bool | None\n  # Path to custom rollout function. Must be importable from current dir.\n  rollout_func: str | None\n  # Multi-objective reward aggregation strategy. 'sum_then_normalize' (GRPO default):\n  # weights and sums rewards first, then normalizes. 'normalize_then_sum' (GDPO):\n  # normalizes each reward independently, then sums.\n  multi_objective_aggregation: Literal['sum_then_normalize', 'normalize_then_sum'] | None\n\n  # Use the GRPODataProducer protocol for online data generation.\n  use_data_producer: bool = False\n  # Generate rollouts in a background thread while training on the previous rollout.\n  async_prefetch: bool = False\n  # Number of rollouts to prefetch ahead of training.\n  prefetch_depth: int | None\n  # Sync model weights to vLLM every N optimizer steps (async mode only).\n  vllm_sync_interval: int | None\n  # Score prompt groups incrementally instead of the full batch at once.\n  streaming_partial_batch: bool | None\n  # Minimum prompt groups to score per streaming chunk.\n  streaming_min_groups: int | None\n  # Apply IS correction for distribution mismatch between vLLM and training model.\n  vllm_importance_sampling_correction: bool | None\n  # IS mode: token_truncate, token_mask, sequence_truncate, or sequence_mask.\n  vllm_importance_sampling_mode: Literal['token_truncate', 'token_mask', 'sequence_truncate', 'sequence_mask'] | None\n  # Cap C for IS ratio clipping/masking.\n  vllm_importance_sampling_cap: float | None\n  # KL threshold for off-policy sequence masking (OPSM). None = disabled.\n  off_policy_mask_threshold: float | None\n  # Apply IS correction to KL divergence term.\n  use_bias_correction_kl: bool | None\n\n  # Number of persistent subprocess workers for parallel reward computation. Each worker\n  # has its own main thread so signal.alarm() (used by math_verify) works correctly.\n  # Work is sharded across workers by prompt groups. Only used with\n  # use_data_producer=True and non-nn.Module reward functions.\n  reward_num_workers: int = 1\n  # [Experimental, disabled by default] Size of the replay buffer for storing high-\n  # signal rollout groups. When &gt; 0, groups with reward variance are cached and used to\n  # replace zero-signal groups (where all rewards are identical). Set to 0 to disable.\n  # Only used with use_data_producer=True.\n  replay_buffer_size: int = 0\n  # When True (default), recompute old_per_token_logps for replayed groups using the\n  # current training model. This fixes the importance sampling mismatch that occurs when\n  # replaying stale data. Only relevant when replay_buffer_size &gt; 0.\n  replay_recompute_logps: bool = True\n  # Fraction of total training steps after which deferred re-rolling begins. Zero-signal\n  # prompts (where all rewards in a group are identical) are buffered and re-injected\n  # into later batches when the model is more likely to solve them. Set to 1.0 to\n  # disable. Only used with use_data_producer=True.\n  reroll_start_fraction: float = 1.0\n  # Maximum number of prompt groups to replace with re-roll candidates per batch. Higher\n  # values increase data utilization but reduce prompt diversity. Only used with\n  # use_data_producer=True.\n  reroll_max_groups: int = 1\n  # When True, skip gradient computation for micro-batches where all advantages are zero\n  # (no learning signal). This avoids the forward/backward pass entirely when no\n  # learning signal is present. The step is logged with skipped_zero_adv_batches=1 for\n  # monitoring.\n  skip_zero_advantage_batches: bool = True\n  # Sync LoRA adapter to vLLM via filesystem instead of merging + NCCL broadcast. Auto-\n  # selects vllm_serve_lora serve module. Syncs only LoRA adapter weights vs full merged\n  # model.\n  vllm_lora_sync: bool = False\n\nvllm: VllmConfig | None\n  # For VllmConfig:\n  # Device to use for VLLM\n  device: str | None = auto\n  # Tensor parallel size for VLLM\n  tensor_parallel_size: int | None\n  # Data parallel size for VLLM\n  data_parallel_size: int | None\n  # GPU memory utilization for VLLM\n  gpu_memory_utilization: float | None = 0.9\n  # Data type for VLLM\n  dtype: str | None = auto\n  # Maximum length of the model context for VLLM\n  max_model_len: int | None\n  # Enable prefix caching for VLLM\n  enable_prefix_caching: bool | None\n  # Host for the vLLM server to start on\n  host: str | None = 0.0.0.0\n  # Port of the vLLM server to start on\n  port: int | None = 8000\n\n  # Enable reasoning for VLLM\n  enable_reasoning: bool | None\n  # Reasoning parser for VLLM\n  reasoning_parser: str | None\n  # Disable CUDA graph capture in vLLM. Required for models with causal_conv1d (e.g.,\n  # Qwen3.5 hybrid linear attention).\n  enforce_eager: bool | None\n  # Python module for vLLM serve script. Set to 'axolotl.scripts.vllm_serve_lora' for\n  # native LoRA support, or leave None for default TRL serve.\n  serve_module: str | None\n  # vLLM worker extension class for weight synchronization. Defaults to\n  # 'trl.scripts.vllm_serve.WeightSyncWorkerExtension'.\n  worker_extension_cls: str | None\n\n# Configuration for Energy-Based Fine-Tuning (EBFT)\nebft: EBFTConfig | None\n  # For EBFTConfig:\n  # Fractional layer depths for feature extraction (e.g., [0.25, 0.5, 0.75])\n  feature_layers: list[float] = [0.25, 0.5, 0.75]\n  # Embedding method: 'last_token', 'mean_pooling', 'completion_mean', or 'concat'\n  embed_method: Literal['last_token', 'mean_pooling', 'completion_mean', 'concat'] = last_token\n  # Apply SVD whitening to feature embeddings\n  use_whitening: bool = False\n  # Coefficient for alignment reward (cosine similarity with ground truth)\n  alignment_coef: float = 1.0\n  # Coefficient for diversity penalty (pairwise similarity between samples)\n  diversity_coef: float = 1.0\n  # Cross-entropy loss coefficient on ground-truth tokens\n  ce_coef: float = 0.0\n  # Set per-batch max_tokens based on ground-truth length\n  adaptive_max_tokens: bool = True\n  # Multiplier for ground-truth token count when computing adaptive max_tokens\n  gt_length_multiplier: float = 1.5\n\n  # EBFT mode: 'structured' (QA with vLLM) or 'strided' (unstructured text)\n  mode: Literal['structured', 'strided'] = structured\n  # Stride between anchor points (tokens)\n  stride: int = 8\n  # Context window size per block\n  context_length: int = 8\n  # Tokens to generate per block\n  generate_max_len: int = 8\n  # Independent rollouts per document\n  n_samples_per_prompt: int = 4\n  # Sampling temperature for strided generation\n  temperature: float = 0.6\n  # Top-p nucleus sampling threshold\n  top_p: float = 1.0\n  # RL policy gradient loss coefficient\n  rl_coef: float = 1.0\n  # Advantage estimator: 'rloo', 'group_norm', 'reinforce'\n  advantage_estimator: Literal['rloo', 'group_norm', 'reinforce'] = rloo\n  # Minimum tokens into completion before placing anchors. Skips anchors too close to\n  # the prompt boundary where features are dominated by prompt context.\n  min_completion_prefix: int = 0\n\nqat: QATConfig | None\n  # For QATConfig:\n  # Fake quantization layout to use for activation quantization.\n  activation_dtype: TorchAOQuantDType | None\n  # Fake quantization layout to use for weight quantization.\n  weight_dtype: TorchAOQuantDType = TorchAOQuantDType.int8\n  # Quantize embedding\n  quantize_embedding: bool | None = False\n  # The number of elements in each group for per-group fake quantization\n  group_size: int | None = 32\n  # The number of steps to apply fake quantization after\n  fake_quant_after_n_steps: int | None\n\nquantization: PTQConfig | None\n  # For PTQConfig:\n  # Fake quantization layout to use for weight quantization.\n  weight_dtype: TorchAOQuantDType = TorchAOQuantDType.int8\n  # Fake quantization layout to use for activation quantization.\n  activation_dtype: TorchAOQuantDType | None\n  # Whether to quantize the embedding layer.\n  quantize_embedding: bool | None\n  # The number of elements in each group for per-group fake quantization\n  group_size: int | None = 32\n\n# Reward modelling: `True` or `False`\nreward_model: bool | None\n\n# Configuration for dynamic checkpointing (trigger by file or signal). Set 'enabled:\n# true' to activate this feature.\ndynamic_checkpoint: DynamicCheckpointConfig | None\n  # For DynamicCheckpointConfig:\n  # Enable dynamic checkpoint triggering during training. Create a file\n  # 'axolotl_checkpoint.save' in the configured `output_dir` to trigger.\n  enabled: bool = False\n  # Check for trigger file every N steps (reduces I/O overhead). Default: 100\n  check_interval: int = 10\n  # Custom trigger filename (optional). If not specified, defaults to\n  # 'axolotl_checkpoint.save'. Specify a filename (not a full path) to override the\n  # default.\n  trigger_file_path: str = \n\n# Process reward modelling: `True` or `False`\nprocess_reward_model: bool | None\n# Coefficient to incentivize the reward model to output mean-zero rewards (proposed by\n# https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`.\ncenter_rewards_coefficient: float | None\nnum_labels: int | None\n\n# Whether to perform weighting in DPO trainer\ndpo_use_weighting: bool | None\ndpo_label_smoothing: float | None\n\n# Whether to use Liger kernel for DPO loss.\ndpo_use_liger_kernel: bool | None\n\ndpo_padding_free: bool | None\n\n# A list of one or more datasets to finetune the model with\ndatasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset | SyntheticDataset], MinLen(1)] | None\n  # For SFTDataset:\n  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n  path: str | None\n  # name of dataset split to load from\n  split: str | None\n  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n  type: str | UserDefinedPrompterType | None\n    # For UserDefinedPrompterType:\n    # Custom user instruction prompt\n    system_prompt: str | None\n    # Use {system} as key to be replaced\n    system_format: str | None\n    field_system: str | None\n    field_instruction: str | None\n    field_input: str | None\n    field_output: str | None\n\n    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n    # be replaced. 'format' can include {input}\n    format: str | None\n    # 'no_input_format' cannot include {input}\n    no_input_format: str | None\n  input_transform: str | None\n  # split dataset into N pieces (use with shards_idx)\n  shards: int | None\n  # the index of sharded dataset to use\n  shards_idx: int | None\n  # process dataset in N sequential chunks for memory efficiency (exclusive with\n  # `shards`)\n  preprocess_shards: int | None\n  conversation: str | None\n\n  # The name of the chat template to use for training, following values are supported:\n  # tokenizer_default: Uses the chat template that is available in the\n  # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n  # will raise an error. This is the default.\n  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n  # template. The custom jinja template should be provided in the chat_template_jinja\n  # field.\n  chat_template: ChatTemplate | str | None\n  # Custom jinja chat template or path to jinja file. Used only if `chat_template:\n  # jinja` or empty.\n  chat_template_jinja: str | None\n  # path to source data files\n  data_files: str | list[str] | None\n  input_format: str | None\n  # name of dataset configuration to load\n  name: str | None\n  # defines the datatype when path is a file\n  ds_type: str | None\n  # For `completion` datasets only, uses the provided field instead of `text` column\n  field: str | None\n  field_human: str | None\n  field_model: str | None\n  # Key containing the messages (default: \"messages\")\n  field_messages: str | None\n  # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n  # schema](https://json-schema.org/learn/getting-started-step-by-step).\n  field_tools: str | None\n  # Key containing the reasoning trace (default: \"reasoning_content\").\n  field_thinking: str | None\n  # The key the chat template expects that indicates the reasoning trace.\n  template_thinking_key: str | None\n\n  message_field_role: str | None\n\n  message_field_content: str | None\n  # Mapping of properties from the input dataset to the chat template. (default:\n  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n  # in the template but not in this mapping, the system will attempt to load it directly\n  # from the message using the property name as the key. Example: In the mapping below,\n  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n  # used as 'content' in the chat template.\n  message_property_mappings: dict[str, str] | None\n  # The key in the message turn that indicates via boolean whether tokens of a turn\n  # should be considered for training. Useful to selectively train on certain turns\n  # besides the `roles_to_train`.\n  message_field_training: str | None\n  # The key in the message turn that contains the training details. Useful to\n  # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n  # containing `begin_offset` (start character index in content), `end_offset` (end\n  # character index in content), and `train` (boolean whether to train).\n  message_field_training_detail: str | None\n  # (for Qwen3 template only) Whether to split the assistant content based on a\n  # reasoning trace inside delimited tags\n  split_thinking: bool | None\n  logprobs_field: str | None\n  temperature: float | None\n  # Roles to train on. The tokens from these roles will be considered for the loss.\n  roles_to_train: list[str] | None\n  # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n  # turn, last: train on the last EOS token in the conversation\n  train_on_eos: Literal['all', 'turn', 'last'] | None\n  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n  # source roles will be mapped to the target role. The default is: user: [\"human\",\n  # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n  roles: dict[str, list[str]] | None\n  # Whether to drop the system turn from the dataset. Only works with chat_template.\n  # This does not drop the default system message from chat_template if it exists. If\n  # you wish to, we recommend using a custom jinja template with the default system\n  # message removed or adding a system turn with empty content.\n  drop_system_message: bool | None\n  # Trust remote code for untrusted source\n  trust_remote_code: bool | None = False\n  # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n  # This can be a commit hash, tag, or branch name. If not specified, the latest version\n  # will be used. This parameter is ignored for local datasets.\n  revision: str | None\n\n  # For DPODataset:\n  path: str | None\n  split: str | None\n  type: UserDefinedDPOType | str | None\n    # For UserDefinedDPOType:\n    field_system: str | None\n    field_prompt: str | None\n    field_chosen: str | None\n    field_rejected: str | None\n    prompt_format: str | None\n    chosen_format: str | None\n    rejected_format: str | None\n  data_files: list[str] | None\n  revision: str | None\n  field_messages: str | None\n\n  # For KTODataset:\n  path: str | None\n  split: str | None\n  type: UserDefinedKTOType | str | None\n    # For UserDefinedKTOType:\n    field_system: str | None\n    field_prompt: str | None\n    field_completion: str | None\n    field_label: bool | None\n    prompt_format: str | None\n    completion_format: str | None\n  data_files: list[str] | None\n  trust_remote_code: bool | None = False\n  revision: str | None\n\n  # For StepwiseSupervisedDataset:\n  path: str | None\n  split: str | None\n  data_files: list[str] | None\n  revision: str | None\n  step_separator: str | None\n  max_completion_length: int | None\n  train_on_last_step_only: bool | None\n\n  # For SyntheticDataset:\n  path: Literal['synthetic'] = synthetic\n  type: Literal['_synthetic'] = _synthetic\n  # Number of rows to generate\n  length: int = 1000\n  # Sequence length per row (defaults to sequence_len from config)\n  sequence_length: int | None\n  # Minimum token ID for generation\n  min_input_id: int = 100\n  # Maximum token ID for generation (defaults to tokenizer vocab_size)\n  max_input_id: int | None\n  # Random seed for reproducibility\n  seed: int | None\n\n# A list of one or more datasets to eval the model with. You can use either\n# test_datasets, or val_set_size, but not both.\ntest_datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset | SyntheticDataset], MinLen(1)] | None\n  # For SFTDataset:\n  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n  path: str | None\n  # name of dataset split to load from\n  split: str | None\n  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n  type: str | UserDefinedPrompterType | None\n    # For UserDefinedPrompterType:\n    # Custom user instruction prompt\n    system_prompt: str | None\n    # Use {system} as key to be replaced\n    system_format: str | None\n    field_system: str | None\n    field_instruction: str | None\n    field_input: str | None\n    field_output: str | None\n\n    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n    # be replaced. 'format' can include {input}\n    format: str | None\n    # 'no_input_format' cannot include {input}\n    no_input_format: str | None\n  input_transform: str | None\n  # split dataset into N pieces (use with shards_idx)\n  shards: int | None\n  # the index of sharded dataset to use\n  shards_idx: int | None\n  # process dataset in N sequential chunks for memory efficiency (exclusive with\n  # `shards`)\n  preprocess_shards: int | None\n  conversation: str | None\n\n  # The name of the chat template to use for training, following values are supported:\n  # tokenizer_default: Uses the chat template that is available in the\n  # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n  # will raise an error. This is the default.\n  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n  # template. The custom jinja template should be provided in the chat_template_jinja\n  # field.\n  chat_template: ChatTemplate | str | None\n  # Custom jinja chat template or path to jinja file. Used only if `chat_template:\n  # jinja` or empty.\n  chat_template_jinja: str | None\n  # path to source data files\n  data_files: str | list[str] | None\n  input_format: str | None\n  # name of dataset configuration to load\n  name: str | None\n  # defines the datatype when path is a file\n  ds_type: str | None\n  # For `completion` datasets only, uses the provided field instead of `text` column\n  field: str | None\n  field_human: str | None\n  field_model: str | None\n  # Key containing the messages (default: \"messages\")\n  field_messages: str | None\n  # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n  # schema](https://json-schema.org/learn/getting-started-step-by-step).\n  field_tools: str | None\n  # Key containing the reasoning trace (default: \"reasoning_content\").\n  field_thinking: str | None\n  # The key the chat template expects that indicates the reasoning trace.\n  template_thinking_key: str | None\n\n  message_field_role: str | None\n\n  message_field_content: str | None\n  # Mapping of properties from the input dataset to the chat template. (default:\n  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n  # in the template but not in this mapping, the system will attempt to load it directly\n  # from the message using the property name as the key. Example: In the mapping below,\n  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n  # used as 'content' in the chat template.\n  message_property_mappings: dict[str, str] | None\n  # The key in the message turn that indicates via boolean whether tokens of a turn\n  # should be considered for training. Useful to selectively train on certain turns\n  # besides the `roles_to_train`.\n  message_field_training: str | None\n  # The key in the message turn that contains the training details. Useful to\n  # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n  # containing `begin_offset` (start character index in content), `end_offset` (end\n  # character index in content), and `train` (boolean whether to train).\n  message_field_training_detail: str | None\n  # (for Qwen3 template only) Whether to split the assistant content based on a\n  # reasoning trace inside delimited tags\n  split_thinking: bool | None\n  logprobs_field: str | None\n  temperature: float | None\n  # Roles to train on. The tokens from these roles will be considered for the loss.\n  roles_to_train: list[str] | None\n  # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n  # turn, last: train on the last EOS token in the conversation\n  train_on_eos: Literal['all', 'turn', 'last'] | None\n  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n  # source roles will be mapped to the target role. The default is: user: [\"human\",\n  # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n  roles: dict[str, list[str]] | None\n  # Whether to drop the system turn from the dataset. Only works with chat_template.\n  # This does not drop the default system message from chat_template if it exists. If\n  # you wish to, we recommend using a custom jinja template with the default system\n  # message removed or adding a system turn with empty content.\n  drop_system_message: bool | None\n  # Trust remote code for untrusted source\n  trust_remote_code: bool | None = False\n  # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n  # This can be a commit hash, tag, or branch name. If not specified, the latest version\n  # will be used. This parameter is ignored for local datasets.\n  revision: str | None\n\n  # For DPODataset:\n  path: str | None\n  split: str | None\n  type: UserDefinedDPOType | str | None\n    # For UserDefinedDPOType:\n    field_system: str | None\n    field_prompt: str | None\n    field_chosen: str | None\n    field_rejected: str | None\n    prompt_format: str | None\n    chosen_format: str | None\n    rejected_format: str | None\n  data_files: list[str] | None\n  revision: str | None\n  field_messages: str | None\n\n  # For KTODataset:\n  path: str | None\n  split: str | None\n  type: UserDefinedKTOType | str | None\n    # For UserDefinedKTOType:\n    field_system: str | None\n    field_prompt: str | None\n    field_completion: str | None\n    field_label: bool | None\n    prompt_format: str | None\n    completion_format: str | None\n  data_files: list[str] | None\n  trust_remote_code: bool | None = False\n  revision: str | None\n\n  # For StepwiseSupervisedDataset:\n  path: str | None\n  split: str | None\n  data_files: list[str] | None\n  revision: str | None\n  step_separator: str | None\n  max_completion_length: int | None\n  train_on_last_step_only: bool | None\n\n  # For SyntheticDataset:\n  path: Literal['synthetic'] = synthetic\n  type: Literal['_synthetic'] = _synthetic\n  # Number of rows to generate\n  length: int = 1000\n  # Sequence length per row (defaults to sequence_len from config)\n  sequence_length: int | None\n  # Minimum token ID for generation\n  min_input_id: int = 100\n  # Maximum token ID for generation (defaults to tokenizer vocab_size)\n  max_input_id: int | None\n  # Random seed for reproducibility\n  seed: int | None\n\n# If false, the datasets will not be shuffled and will keep their original order in\n# `datasets`. The same applies to the `test_datasets` option and the\n# `pretraining_dataset` option. Default is true.\nshuffle_merged_datasets: bool | None = True\n# If true, each dataset in `datasets` will be shuffled before merging. This allows\n# curriculum learning strategies to be applied at the dataset level. Default is false.\nshuffle_before_merging_datasets: bool | None = False\n# Axolotl attempts to save the dataset as an arrow after packing the data together so\n# subsequent training attempts load faster, relative path\ndataset_prepared_path: str | None\n# Num shards for whole dataset\ndataset_shard_num: int | None\n# Index of shard to use for whole dataset\ndataset_shard_idx: int | None\nskip_prepare_dataset: bool | None = False\n# Number of shards to save the prepared dataset\nnum_dataset_shards_to_save: int | None\n\n# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize\npretraining_dataset: Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None\n  # For PretrainingDataset:\n  name: str | None\n  path: str | None\n  split: str | None = train\n  text_column: str | None = text\n  type: str | None = pretrain\n  trust_remote_code: bool | None = False\n  data_files: str | None\n  skip: int | None\n\n  # For SFTDataset:\n  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n  path: str | None\n  # name of dataset split to load from\n  split: str | None\n  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n  type: str | UserDefinedPrompterType | None\n    # For UserDefinedPrompterType:\n    # Custom user instruction prompt\n    system_prompt: str | None\n    # Use {system} as key to be replaced\n    system_format: str | None\n    field_system: str | None\n    field_instruction: str | None\n    field_input: str | None\n    field_output: str | None\n\n    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n    # be replaced. 'format' can include {input}\n    format: str | None\n    # 'no_input_format' cannot include {input}\n    no_input_format: str | None\n  input_transform: str | None\n  # split dataset into N pieces (use with shards_idx)\n  shards: int | None\n  # the index of sharded dataset to use\n  shards_idx: int | None\n  # process dataset in N sequential chunks for memory efficiency (exclusive with\n  # `shards`)\n  preprocess_shards: int | None\n  conversation: str | None\n\n  # The name of the chat template to use for training, following values are supported:\n  # tokenizer_default: Uses the chat template that is available in the\n  # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n  # will raise an error. This is the default.\n  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n  # template. The custom jinja template should be provided in the chat_template_jinja\n  # field.\n  chat_template: ChatTemplate | str | None\n  # Custom jinja chat template or path to jinja file. Used only if `chat_template:\n  # jinja` or empty.\n  chat_template_jinja: str | None\n  # path to source data files\n  data_files: str | list[str] | None\n  input_format: str | None\n  # name of dataset configuration to load\n  name: str | None\n  # defines the datatype when path is a file\n  ds_type: str | None\n  # For `completion` datasets only, uses the provided field instead of `text` column\n  field: str | None\n  field_human: str | None\n  field_model: str | None\n  # Key containing the messages (default: \"messages\")\n  field_messages: str | None\n  # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n  # schema](https://json-schema.org/learn/getting-started-step-by-step).\n  field_tools: str | None\n  # Key containing the reasoning trace (default: \"reasoning_content\").\n  field_thinking: str | None\n  # The key the chat template expects that indicates the reasoning trace.\n  template_thinking_key: str | None\n\n  message_field_role: str | None\n\n  message_field_content: str | None\n  # Mapping of properties from the input dataset to the chat template. (default:\n  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n  # in the template but not in this mapping, the system will attempt to load it directly\n  # from the message using the property name as the key. Example: In the mapping below,\n  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n  # used as 'content' in the chat template.\n  message_property_mappings: dict[str, str] | None\n  # The key in the message turn that indicates via boolean whether tokens of a turn\n  # should be considered for training. Useful to selectively train on certain turns\n  # besides the `roles_to_train`.\n  message_field_training: str | None\n  # The key in the message turn that contains the training details. Useful to\n  # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n  # containing `begin_offset` (start character index in content), `end_offset` (end\n  # character index in content), and `train` (boolean whether to train).\n  message_field_training_detail: str | None\n  # (for Qwen3 template only) Whether to split the assistant content based on a\n  # reasoning trace inside delimited tags\n  split_thinking: bool | None\n  logprobs_field: str | None\n  temperature: float | None\n  # Roles to train on. The tokens from these roles will be considered for the loss.\n  roles_to_train: list[str] | None\n  # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n  # turn, last: train on the last EOS token in the conversation\n  train_on_eos: Literal['all', 'turn', 'last'] | None\n  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n  # source roles will be mapped to the target role. The default is: user: [\"human\",\n  # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n  roles: dict[str, list[str]] | None\n  # Whether to drop the system turn from the dataset. Only works with chat_template.\n  # This does not drop the default system message from chat_template if it exists. If\n  # you wish to, we recommend using a custom jinja template with the default system\n  # message removed or adding a system turn with empty content.\n  drop_system_message: bool | None\n  # Trust remote code for untrusted source\n  trust_remote_code: bool | None = False\n  # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n  # This can be a commit hash, tag, or branch name. If not specified, the latest version\n  # will be used. This parameter is ignored for local datasets.\n  revision: str | None\n\n# The maximum number of processes to use while preprocessing your input dataset. This\n# defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of\n# vCPUs via RUNPOD_CPU_COUNT.\ndataset_processes: int | None\n# The maximum number of processes to use while preprocessing your input dataset. This\n# defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of\n# vCPUs via RUNPOD_CPU_COUNT.\ndataset_num_proc: int | None\n\n# Deduplicates datasets and test_datasets with identical entries\ndataset_exact_deduplication: bool | None\n# Keep dataset in memory while preprocessing. Only needed if cached dataset is taking\n# too much storage\ndataset_keep_in_memory: bool | None\ndataloader_pin_memory: bool | None\ndataloader_num_workers: int | None\ndataloader_prefetch_factor: int | None\ndataloader_drop_last: bool | None\n\naccelerator_config: dict[str, Any] | None\n\nremove_unused_columns: bool | None\n\n# Push prepared dataset to hub - repo_org/repo_name\npush_dataset_to_hub: str | None\n# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private\n# datasets. Required to be true when used in combination with `push_dataset_to_hub`\nhf_use_auth_token: bool | None\n\ndevice: Any | None\n# Passed through to transformers when loading the model when launched without\n# accelerate. Use `sequential` when training w/ model parallelism to limit memory\ndevice_map: Any | None\nworld_size: int | None\n# Don't mess with this, it's here for accelerate and torchrun\nlocal_rank: int | None\nddp: bool | None\n\n# Seed for reproducibility\nseed: int | None\n# Advanced DDP Arguments - timeout\nddp_timeout: int | None\n# Advanced DDP Arguments - bucket cap in MB\nddp_bucket_cap_mb: int | None\n# Advanced DDP Arguments - broadcast buffers\nddp_broadcast_buffers: bool | None\nddp_find_unused_parameters: bool | None\n\n# Whether to run causal language model evaluation for metrics in\n# `eval_causal_lm_metrics`\ndo_causal_lm_eval: bool | None\n# HF evaluate metrics used during evaluation. Default is ['sacrebleu', 'comet', 'ter',\n# 'chrf', 'perplexity']\neval_causal_lm_metrics: list[str] | None\ndo_bench_eval: bool | None\nbench_dataset: str | None\nbench_split: str | None\nmetric_for_best_model: str | None\ngreater_is_better: bool | None\n\n# High loss value, indicating the learning has broken down (a good estimate is ~2 times\n# the loss at the start of training)\nloss_watchdog_threshold: float | None\n# Number of high-loss steps in a row before the trainer aborts (default: 3)\nloss_watchdog_patience: int | None\n\n# Run garbage collection every `gc_steps` steps. -1 will run on epoch end and before\n# evaluations. Default is 0 (disabled).\ngc_steps: int | None\n\n# Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection.\n# require &gt;=ampere\nbf16: Literal['auto'] | bool | None = auto\n# Use CUDA fp16\nfp16: bool | None\n# Enable FP8 mixed precision training using TorchAO. Best used in combination with\n# torch.compile.\nfp8: bool | None\n# Enable FSDP float8 all-gather optimization for FP8 training. Can improve training\n# speed by 10-15% when FSDP is enabled.\nfp8_enable_fsdp_float8_all_gather: bool | None\n# No AMP (automatic mixed precision) - require &gt;=ampere\nbfloat16: bool | None\n# No AMP (automatic mixed precision)\nfloat16: bool | None\n# bool to use CUDA tf32 or 'auto' for automatic detection - require &gt;=ampere\ntf32: Literal['auto'] | bool | None = auto\nfloat32: bool | None\n\n# Whether to use gradient checkpointing. Available options are: true, false, 'offload',\n# 'offload_disk'.\n# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\ngradient_checkpointing: Literal['offload', 'offload_disk'] | bool | None = False\n# Additional kwargs to pass to the trainer for gradient checkpointing\ngradient_checkpointing_kwargs: dict[str, Any] | None\n# Whether to offload activations. Available options are: true, false, 'legacy', 'disk'.\nactivation_offloading: Literal['legacy', 'disk'] | bool | None = False\n# Offload model layer parameters to CPU during forward, prefetch back during backward.\nlayer_offloading: bool | None = False\n\n# List of regex patterns for parameter names to keep unfrozen. All other parameters will\n# be frozen via requires_grad=False. Note: range-based patterns (e.g.\n# embed_tokens.weight$[:32000]) use gradient zeroing rather than a true freeze, so\n# weight decay will still apply to the frozen portion and optimizer states are allocated\n# for the full parameter.\nunfrozen_parameters: list[str] | None\n\n# The maximum length of an input to train with, this should typically be less than 2048\n# as most models have a token/context limit of 2048\nsequence_len: int = 512\n# What to do when a tokenized row exceeds sequence_len. 'drop' removes the row;\n# 'truncate' slices tensors to sequence_len; 'raise' raises a ValueError. Defaults to\n# 'drop' for backward compatibility.\nexcess_length_strategy: Literal['drop', 'truncate', 'raise'] | None\n# The maximum length of an input for evaluation. If not specified, defaults to\n# sequence_len\neval_sequence_len: int | None\nmin_sample_len: int | None\n# maximum prompt length for RL training\nmax_prompt_len: int | None\n# Use efficient multi-packing with block diagonal attention and per sequence\n# position_ids. Recommend set to 'true'\nsample_packing: bool | None\n# The number of samples packed at a time. Increasing the following values helps with\n# packing, but usually only slightly (&lt;%1.)\nsample_packing_group_size: int | None = 100000\n# The number of samples which can be packed into one sequence. Increase if using a large\n# sequence_len with many short samples.\nsample_packing_bin_size: int | None = 200\n# Whether to pack samples sequentially\nsample_packing_sequentially: bool | None\n# The multiprocessing start method to use for packing. Should be 'fork', 'spawn' or\n# 'forkserver'\nsample_packing_mp_start_method: str | None\n# Set to 'false' if getting errors during eval with sample_packing on\neval_sample_packing: bool | None\n# Pad inputs so each step uses constant sized buffers. This will reduce memory\n# fragmentation and may prevent OOMs, by re-using memory more efficiently. Defaults to\n# True if `sample_packing` enabled\npad_to_sequence_len: bool | None\n# Whether to use sequential sampling for curriculum learning\ncurriculum_sampling: bool | None\nmultipack_real_batches: bool | None\n\n# Use batch flattening for speedups when not using sample_packing\nbatch_flattening: Literal['auto'] | bool | None\n\nuse_pose: bool | None\npose_split_on_token_ids: list[int] | None\npose_max_context_len: int | None\npose_num_chunks: int | None\n\npretrain_multipack_buffer_size: int | None\n# whether to prevent cross attention for packed sequences during pretraining\npretrain_multipack_attn: bool | None = True\n# whether to concatenate samples during pretraining\npretraining_sample_concatenation: bool | None\n\n# Use streaming mode for loading datasets\nstreaming: bool | None\n# Buffer size for multipack streaming datasets\nstreaming_multipack_buffer_size: int | None = 10000\n\n# Whether to use xformers attention patch https://github.com/facebookresearch/xformers\nxformers_attention: bool | None\n# Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/\n# torch.nn.functional.scaled_dot_product_attention.html\nsdp_attention: bool | None\n# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf\ns2_attention: bool | None\nflex_attention: bool | None\nflex_attn_compile_kwargs: dict[str, Any] | None\n# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention\nflash_attention: bool | None\n# Whether to use flash-attention cross entropy implementation - advanced use only\nflash_attn_cross_entropy: bool | None\n# Whether to use flash-attention rms norm implementation - advanced use only\nflash_attn_rms_norm: bool | None\n# Whether to fuse part of the MLP into a single operation\nflash_attn_fuse_mlp: bool | None\n# Whether to use bettertransformers\nflash_optimum: bool | None\n# Whether to use SageAttention https://github.com/thu-ml/SageAttention\nsage_attention: bool | None\n\neager_attention: bool | None\n\n# Specify a custom attention implementation, used mostly for kernels.\nattn_implementation: str | None\n\n# Which experts implementation to use for MoE models,\nexperts_implementation: str | None\n\n# Quantize MoE expert weights on load to reduce VRAM. Requires adapter (lora/qlora) with\n# load_in_4bit or load_in_8bit. Requires CUDA (not compatible with ROCm or other\n# backends). Note: total parameter count may be reported incorrectly when enabled\n# (trainable param count is correct).\nquantize_moe_experts: bool = False\n\n# Whether to use Scaled Softmax (SSMax) attention. Ref: https://arxiv.org/abs/2501.19399\nscaling_softmax: bool | None\n# Scaling factor for SSMax attention. Default is 0.43\nscaling_softmax_factor: float | None\n# Bias for SSMax attention. Default is 0.0. Note: The paper recommends bias=0 for better\n# length generalization.\nscaling_softmax_bias: float | None\n\nunsloth_cross_entropy_loss: bool | None\nunsloth_lora_mlp: bool | None\nunsloth_lora_qkv: bool | None\nunsloth_lora_o: bool | None\nunsloth_rms_norm: bool | None\nunsloth_rope: bool | None\n\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_mlp_kernel: bool | None\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_qkv_kernel: bool | None\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_o_kernel: bool | None\n# Apply custom LoRA autograd function for embedding layers. See:\n# https://docs.axolotl.ai/docs/lora_optims.html\nlora_embedding_kernel: bool | None\n\n# Whether to use chunked cross entropy loss for memory efficiency\nchunked_cross_entropy: bool | None\n# Number of chunks to use for chunked cross entropy loss\nchunked_cross_entropy_num_chunks: int | None\n# Enable Entropy-Aware Focal Training loss (EAFT)\nuse_eaft: bool | None\n# Exponent for entropy weighting in EAFT (default: 1.0)\neaft_alpha: float | None = 1.0\n# Number of top logits for entropy approximation (default: 20)\neaft_k: int | None = 20\n\n# Whether to use ALST tiled mlp for memory efficient long context\ntiled_mlp: bool | None\n\n# Number of shards to use for ALST tiled mlp. If unset, it will be set based on\n# seqlen/hidden_size\ntiled_mlp_num_shards: int | None\n\n# Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on\n# llama.\ntiled_mlp_use_original_mlp: bool | None = True\n\nllama4_linearized_experts: bool | None\n\n# Deepspeed config path. e.g., deepspeed_configs/zero3.json\ndeepspeed: str | dict[str, Any] | None\n# Whether to use deepcompile for faster training with deepspeed\ndeepcompile: bool | None\n# FSDP configuration\nfsdp: list[str] | None\n\n# FSDP configuration options\nfsdp_config: FSDPConfig | None\n  # For FSDPConfig:\n  # FSDP version\n  fsdp_version: int | None\n  # Enable activation checkpointing to reduce memory usage during forward passes\n  activation_checkpointing: bool | None\n  # Offload parameters to CPU to reduce GPU memory usage\n  offload_params: bool | None\n  # Synchronize module states across all processes\n  sync_module_states: bool | None\n  # Enable CPU RAM efficient loading to reduce memory usage during model loading\n  cpu_ram_efficient_loading: bool | None\n  # Disabling this enables swap memory usage for resource-constrained setups when\n  # offload_params is enabled.\n  cpu_offload_pin_memory: bool | None\n  # Use original parameters instead of flattened parameters\n  use_orig_params: bool | None\n\n  # Type of state dict to use for saving/loading checkpoints\n  state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n  # Final state dict type to use after training completion\n  final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n\n  # Policy for automatically wrapping modules with FSDP\n  auto_wrap_policy: Literal['TRANSFORMER_BASED_WRAP', 'SIZE_BASED_WRAP'] | None\n  # Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')\n  transformer_layer_cls_to_wrap: str | None\n\n  # Reshard parameters after forward pass to save memory\n  reshard_after_forward: bool | None\n  # Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')\n  mixed_precision_policy: str | None\n\n# FSDP version\nfsdp_version: int | None\nfsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n\n# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for\n# no eval.\nval_set_size: float | None = 0.0\n\n# Number of devices to shard across. If not set, will use all available devices.\ndp_shard_size: int | None\n# Number of devices to replicate across.\ndp_replicate_size: int | None\n# Deprecated: use `context_parallel_size` instead\nsequence_parallel_degree: int | None\n# Set to a divisor of the number of GPUs available to split sequences into chunks of\n# equal size. Use in long context training to prevent OOM when sequences cannot fit into\n# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each\n# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized\n# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more\n# details.\ncontext_parallel_size: int | None\n# Optional; strides across the key dimension. Larger values use more memory but should\n# make training faster. Must evenly divide the number of KV heads in your model.\nheads_k_stride: int | None\n# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to\n# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing\n# case.\nring_attn_func: RingAttnFunc | None\n# Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP.\ntensor_parallel_size: int | None\n\n# Add or change special tokens. If you add tokens here, you don't need to add them to\n# the `tokens` list.\nspecial_tokens: SpecialTokensConfig | None\n  # For SpecialTokensConfig:\n  bos_token: str | None\n  eos_token: str | None\n  pad_token: str | None\n  unk_token: str | None\n  additional_special_tokens: list[str] | None\n\n# Add extra tokens to the tokenizer\ntokens: list[str] | None\n# Mapping token_id to new_token_string to override reserved added_tokens in the\n# tokenizer. Only works for tokens that are not part of the base vocab (aka are\n# added_tokens). Can be checked if they exist in tokenizer.json added_tokens.\nadded_tokens_overrides: dict[int, str] | None\n\n# Whether to use torch.compile and which backend to use. setting to `auto` will enable\n# torch compile when torch&gt;=2.6.0\ntorch_compile: Literal['auto'] | bool | None\n# Backend to use for torch.compile\ntorch_compile_backend: str | None\ntorch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None\n\n# Maximum number of iterations to train for. It precedes num_epochs which means that if\n# both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps =&gt;\n# `num_epochs: 2` and `max_steps: 100` will train for 100 steps\nmax_steps: int | None\n# Number of warmup steps. Cannot use with warmup_ratio\nwarmup_steps: int | None\n# Warmup ratio. Cannot use with warmup_steps\nwarmup_ratio: float | None\n# Leave empty to eval at each epoch, integer for every N steps. float for fraction of\n# total steps\neval_steps: int | float | None\n# Number of times per epoch to run evals, mutually exclusive with eval_steps\nevals_per_epoch: int | None\n# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer\n# from `eval_steps`\neval_strategy: str | None\n\n# Leave empty to save at each epoch, integer for every N steps. float for fraction of\n# total steps\nsave_steps: int | float | None\n# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps\nsaves_per_epoch: int | None\n# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better\n# result is achieved, leave empty to infer from `save_steps`\nsave_strategy: str | None\n# Checkpoints saved at a time\nsave_total_limit: int | None\n# Whether to checkpoint a model after the first step of training. Defaults to False.\nsave_first_step: bool | None\n\n# Logging frequency\nlogging_steps: int | None\n# Stop training after this many evaluation losses have increased in a row. https://huggi\n# ngface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppin\n# gCallback\nearly_stopping_patience: int | None\nload_best_model_at_end: bool | None = False\n# Save only the model weights, skipping the optimizer. Using this means you can't resume\n# from checkpoints.\nsave_only_model: bool | None = False\n# Use tensorboard for logging\nuse_tensorboard: bool | None\n# Enable the pytorch profiler to capture the first N steps of training to the\n# output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more\n# information. Snapshots can be visualized @ https://pytorch.org/memory_viz\nprofiler_steps: int | None\n# Which step to start the profiler at. Useful for only capturing a few steps mid-run.\nprofiler_steps_start: int | None = 0\n# bool of whether to report tokens per second at the end of training. This is not\n# supported with pre-training datasets.\ninclude_tokens_per_second: bool | None\n# bool of whether to report tokens per second per-gpu during training by measuring\n# throughput of non-padding tokens.\ninclude_tkps: bool | None = True\n# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to\n# add noise to embeddings. Currently only supported on Llama and Mistral\nneftune_noise_alpha: float | None\n\n# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to\n# `beta` in `ORPOConfig` due to trl mapping.\norpo_alpha: float | None\n# Target reward margin for the SimPO loss\nsimpo_gamma: float | None\n# Weight of the BC regularizer\ncpo_alpha: float | None\n\n# Factor for desirable loss term in KTO loss\nkto_desirable_weight: float | None\n# Factor for undesirable loss term in KTO loss\nkto_undesirable_weight: float | None\n# The beta parameter for the RL training\nrl_beta: float | None\n\n# Defines the max memory usage per gpu on the system. Passed through to transformers\n# when loading the model.\nmax_memory: dict[int | Literal['cpu', 'disk'], int | str] | None\n# Limit the memory for all available GPUs to this amount (if an integer, expressed in\n# gigabytes); default: unset\ngpu_memory_limit: int | str | None\n# Whether to use low_cpu_mem_usage\nlow_cpu_mem_usage: bool | None\n\n# The name of the chat template to use for training, following values are supported:\n# tokenizer_default: Uses the chat template that is available in the\n# tokenizer_config.json. If the chat template is not available in the tokenizer, it will\n# raise an error. This is the default value.\n# alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n# are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n# tokenizer_default_fallback_*: where * is the name of the chat template to fallback to.\n# E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not\n# available in the tokenizer. jinja: Uses a custom jinja template for the chat template.\n# The custom jinja template should be provided in the chat_template_jinja field. The\n# selected chat template will be saved to the tokenizer_config.json for easier\n# inferencing\nchat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None\n# Custom jinja template or path to jinja file for chat template. This will be only used\n# if chat_template is set to `jinja` or `null` (in which case chat_template is\n# automatically set to `jinja`). Default is null.\nchat_template_jinja: str | None\n# Additional kwargs to pass to the chat template. This is useful for customizing the\n# chat template. For example, you can pass `thinking=False` to add a generation prompt\n# to the chat template.\nchat_template_kwargs: dict[str, Any] | None\n# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the\n# boundaries between conversation turns. For example: ['/INST', '&lt;/s&gt;',\n# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is\n# useful for templates that use multiple delimiter tokens.\neot_tokens: list[str] | None\n# Changes the default system message. Currently only supports chatml.\ndefault_system_message: str | None\n\n# Token index or indices to adjust embedding weights to the mean of the other tokens.\n# This is useful when the model has untrained embeddings.\nfix_untrained_tokens: int | list[int] | None\n\nis_preprocess: bool | None\npreprocess_iterable: bool | None\n\n# Total number of tokens - internal use\ntotal_num_tokens: int | None\ntotal_supervised_tokens: int | None\n# You can set these packing optimizations AFTER starting a training at least once. The\n# trainer will provide recommended values for these values.\nsample_packing_eff_est: float | None\naxolotl_config_path: str | None\n\n# Internal use only - Used to identify which the model is based on\nis_falcon_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on\nis_llama_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on. Please note that if\n# you set this to true, `padding_side` will be set to 'left' by default\nis_mistral_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on\nis_qwen_derived_model: bool | None\n\n# Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available\n# plugins or doc below for more details.\n# https://docs.axolotl.ai/docs/custom_integrations.html\nplugins: list[str] | None\n# Enable sample generation during training for monitoring\ngenerate_samples: bool | None = False\n# Number of samples to generate at each interval\nnum_generation_samples: int | None = 3\n# Maximum new tokens to generate per sample\ngeneration_max_new_tokens: int | None = 50\n# Temperature for sample generation (0.0 = greedy)\ngeneration_temperature: float | None = 0.7\n# Nucleus sampling parameter for generation\ngeneration_top_p: float | None\n# Top-k sampling parameter for generation\ngeneration_top_k: int | None\n# Ratio of input to use as prompt (0.0-1.0)\ngeneration_prompt_ratio: float | None = 0.5\n# Whether to use sampling (vs greedy decoding)\ngeneration_do_sample: bool | None = True\n\n# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This\n# can also be a relative path to a model on disk\nbase_model: str (required)\n# If the base_model repo on hf hub doesn't include configuration .json files, You can\n# set that here, or leave this empty to default to base_model\nbase_model_config: str | None\n# transformers config class (e.g., 'LlamaConfig', 'MistralConfig'). Defaults to\n# AutoConfig.\ncls_model_config: str | None\n# Optional tokenizer configuration path in case you want to use a different tokenizer\n# than the one defined in the base model\ntokenizer_config: str | None\n# use_fast option for tokenizer loading from_pretrained, default to True\ntokenizer_use_fast: bool | None\n# Whether to use the legacy tokenizer setting, defaults to True\ntokenizer_legacy: bool | None\n# Whether to use mistral-common tokenizer. If set to True, it will use the mistral-\n# common tokenizer.\ntokenizer_use_mistral_common: bool | None\n# Corresponding tokenizer for the model AutoTokenizer is a good choice\ntokenizer_type: str | None\n# transformers processor class\nprocessor_type: str | None\n# Whether to save jinja files for tokenizer, transformers default is True\ntokenizer_save_jinja_files: bool | None = True\n# Trust remote code for untrusted source\ntrust_remote_code: bool | None\n\n# Don't move the model to the device before sharding. Set to `false` to revert to legacy\n# behavior.\nexperimental_skip_move_to_device: bool | None = True\n\n# Use custom kernels, e.g. MegaBlocks.\nuse_kernels: bool | None\n\n# Model loading quantization config\nmodel_quantization_config: Literal['Mxfp4Config'] | None\n# kwargs for model quantization config\nmodel_quantization_config_kwargs: dict[str, Any] | None\n\n# Where to save the full-finetuned model to\noutput_dir: str = ./model-out\n# push checkpoints to hub\nhub_model_id: str | None\n# how to push checkpoints to hub\nhub_strategy: str | None\n# branch/revision to push to on hub (default: main)\nhub_revision: str | None\n# Whether to save the model using safetensors format. Defaults to True.\nsave_safetensors: bool | None = True\n\n# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer\nload_in_8bit: bool | None = False\n# Use bitsandbytes 4 bit\nload_in_4bit: bool | None = False\n\n# If you want to use 'lora', 'qlora', or 'llama-adapter', or leave blank to train all\n# parameters in original model\nadapter: Literal['lora', 'qlora', 'llama-adapter'] | None\n# If you already have a lora model trained that you want to load, put that here. This\n# means after training, if you want to test the model, you should set this to the value\n# of `output_dir`. Note that if you merge an adapter to the base model, a new\n# subdirectory `merged` will be created under the `output_dir`.\nlora_model_dir: str | None\nlora_r: int | None\nlora_alpha: int | None\nlora_fan_in_fan_out: bool | None\nlora_target_modules: str | list[str] | None\nlora_target_parameters: str | list[str] | None\n# If true, will target all linear modules\nlora_target_linear: bool | None\n# If you added new tokens to the tokenizer, you may need to save some LoRA modules\n# because they need to know the new tokens. For LLaMA and Mistral, you need to save\n# `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts\n# tokens to embeddings, and `lm_head` converts embeddings to token probabilities.\nlora_modules_to_save: list[str] | None\nlora_dropout: float | None = 0.0\n# The layer indices to transform, otherwise, apply to all layers\npeft_layers_to_transform: list[int] | None\npeft_layers_pattern: list[str] | None\n\npeft: PeftConfig | None\n  # For PeftConfig:\n  # Configuration options for loftq initialization for LoRA\n  loftq_config: LoftQConfig | None\n    # For LoftQConfig:\n    # typically 4 bits\n    loftq_bits: int = 4\n\n# Whether to use DoRA.\npeft_use_dora: bool | None\n# Whether to use RSLoRA.\npeft_use_rslora: bool | None\n# List of layer indices to replicate.\npeft_layer_replication: list[tuple[int, int]] | None\n# How to initialize LoRA weights. Default to True which is MS original implementation.\npeft_init_lora_weights: bool | str | None\n# A list of token indices to fine-tune on the `embed_tokens` layer. Otherwise, a dict\n# mapping an embedding layer name to its trainable token indices. See\n# https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-\n# tokens-alongside-lora\npeft_trainable_token_indices: list[int] | dict[str, list[int]] | None\n# Whether to tie adapter weights for tied model weights. See\n# https://github.com/huggingface/peft/issues/2864\npeft_ensure_weight_tying: bool | None\n# Whether to upcast the LoRA adapter to fp32. This is enabled by default in PEFT.\npeft_autocast_adapter_dtype: bool | None\n\n# load qlora model in sharded format for FSDP using answer.ai technique.\nqlora_sharded_model_loading: bool | None = False\n# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it\n# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge\nlora_on_cpu: bool | None\n# Whether you are training a 4-bit GPTQ quantized model\ngptq: bool | None\n# optional overrides to the bnb 4bit quantization configuration\nbnb_config_kwargs: dict[str, Any] | None\n\n# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.\nloraplus_lr_ratio: float | None\n# loraplus learning rate for lora embedding layers. Default value is 1e-6.\nloraplus_lr_embedding: float | None = 1e-06\n\nmerge_lora: bool | None\n# Method to use for LoRA merging. 'memory_efficient' (default) processes shards\n# individually to reduce memory usage, 'legacy' loads the full model into memory.\nmerge_method: Literal['legacy', 'memory_efficient'] | None = memory_efficient\n\n# Whether to use ReLoRA. Use with jagged_restart_*steps options.\nrelora: bool | None\n# threshold for optimizer magnitude when pruning\nrelora_prune_ratio: float | None\n# True to perform lora weight merges on cpu during restarts, for modest gpu memory\n# savings\nrelora_cpu_offload: bool | None\n\n# how often to reset for jagged restarts\njagged_restart_steps: int | None\n# how many warmup steps to take after reset for jagged restarts\njagged_restart_warmup_steps: int | None\n# how many anneal steps to take before reset for jagged restarts\njagged_restart_anneal_steps: int | None\n\n# If greater than 1, backpropagation will be skipped and the gradients will be\n# accumulated for the given number of steps.\ngradient_accumulation_steps: int | None = 1\n# The number of samples to include in each batch. This is the number of samples sent to\n# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps\nmicro_batch_size: int | None = 1\n# Total batch size, we do not recommended setting this manually\nbatch_size: int | None\n# per gpu micro batch size for evals, defaults to value of micro_batch_size\neval_batch_size: int | None\n\n# whether to find batch size that fits in memory. Passed to underlying transformers\n# Trainer\nauto_find_batch_size: bool | None\n\n# Whether to mask out or include the human's prompt from the training labels\ntrain_on_inputs: bool | None = False\n# Group similarly sized data to minimize padding. May be slower to start, as it must\n# download and sort the entire dataset. Note that training loss may have an oscillating\n# pattern with this enabled.\ngroup_by_length: bool | None\n\nlearning_rate: str | float (required)\nembedding_lr: float | None\nembedding_lr_scale: float | None\n# Specify weight decay\nweight_decay: float | None = 0.0\n# Specify optimizer\noptimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED\n# Dictionary of arguments to pass to the optimizer\noptim_args: str | dict[str, Any] | None\n# The target modules to optimize, i.e. the module names that you would like to train,\n# right now this is used only for GaLore algorithm\noptim_target_modules: list[str] | Literal['all_linear'] | None\n# Path to torch distx for optim 'adamw_anyprecision'\ntorchdistx_path: str | None\nlr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE\n# Specify a scheduler and kwargs to use with the optimizer\nlr_scheduler_kwargs: dict[str, Any] | None\nlr_quadratic_warmup: bool | None\n# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of\n# peak lr\ncosine_min_lr_ratio: float | None\n# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means\n# start cosine_min_lr at 80% of training step\ncosine_constant_lr_ratio: float | None\n# Learning rate div factor\nlr_div_factor: float | None\n\nlr_groups: list[LrGroup] | None\n  # For LrGroup:\n  name: str (required)\n  modules: list[str] (required)\n  lr: float (required)\n\n# adamw hyperparams\nadam_epsilon: float | None\n# only used for CAME Optimizer\nadam_epsilon2: float | None\n# adamw hyperparams\nadam_beta1: float | None\n# adamw hyperparams\nadam_beta2: float | None\n# only used for CAME Optimizer\nadam_beta3: float | None\n\n# Dion Optimizer learning rate\ndion_lr: float | None\n# Dion Optimizer momentum\ndion_momentum: float | None\n# Dion Optimizer: r/d fraction for low-rank approximation. Used to compute the low-rank\n# dimension.\ndion_rank_fraction: float | None = 1.0\n# Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may\n# be useful to ensure even sharding.\ndion_rank_multiple_of: int | None = 1\n\n# Gradient clipping max norm\nmax_grad_norm: float | None\nnum_epochs: float = 1.0\n\nuse_wandb: bool | None\n# Set the name of your wandb run\nwandb_name: str | None\n# Set the ID of your wandb run\nwandb_run_id: str | None\n# \"offline\" to save run metadata locally and not sync to the server, \"disabled\" to turn\n# off wandb\nwandb_mode: str | None\n# Your wandb project name\nwandb_project: str | None\n# A wandb Team name if using a Team\nwandb_entity: str | None\nwandb_watch: str | None\n# \"checkpoint\" to log model to wandb Artifacts every `save_steps` or \"end\" to log only\n# at the end of training\nwandb_log_model: str | None\n\nuse_mlflow: bool | None\n# URI to mlflow\nmlflow_tracking_uri: str | None\n# Your experiment name\nmlflow_experiment_name: str | None\n# Your run name\nmlflow_run_name: str | None\n# set to true to copy each saved checkpoint on each save to mlflow artifact registry\nhf_mlflow_log_artifacts: bool | None\n\n# Enable or disable Comet integration.\nuse_comet: bool | None\n# API key for Comet. Recommended to set via `comet login`.\ncomet_api_key: str | None\n# Workspace name in Comet. Defaults to the user's default workspace.\ncomet_workspace: str | None\n# Project name in Comet. Defaults to Uncategorized.\ncomet_project_name: str | None\n# Identifier for the experiment. Used to append data to an existing experiment or\n# control the key of new experiments. Default to a random key.\ncomet_experiment_key: str | None\n# Create a new experiment (\"create\") or log to an existing one (\"get\"). Default\n# (\"get_or_create\") auto-selects based on configuration.\ncomet_mode: str | None\n# Set to True to log data to Comet server, or False for offline storage. Default is\n# True.\ncomet_online: bool | None\n# Dictionary for additional configuration settings, see the doc for more details.\ncomet_experiment_config: dict[str, Any] | None\n\nuse_trackio: bool | None\n# Your trackio project name\ntrackio_project_name: str | None\n# Set the name of your trackio run\ntrackio_run_name: str | None\n# Hugging Face Space ID to sync dashboard to (optional, runs locally if not provided)\ntrackio_space_id: str | None\n\n# Enable OpenTelemetry metrics collection and Prometheus export\nuse_otel_metrics: bool | None = False\n# Host to bind the OpenTelemetry metrics server to\notel_metrics_host: str | None = localhost\n# Port for the Prometheus metrics HTTP server\notel_metrics_port: int | None = 8000\n\n# the number of activate layers in LISA\nlisa_n_layers: int | None\n# how often to switch layers in LISA\nlisa_step_interval: int | None\n# path under the model to access the layers\nlisa_layers_attribute: str | None = model.layers\n\ngradio_title: str | None\ngradio_share: bool | None\ngradio_server_name: str | None\ngradio_server_port: int | None\ngradio_max_new_tokens: int | None\ngradio_temperature: float | None\n\nuse_ray: bool = False\nray_run_name: str | None\nray_num_workers: int = 1\nresources_per_worker: dict\n\n# The size of the image to resize to. It can be an integer (resized into padded-square\n# image) or a tuple (width, height).If not provided, we will attempt to load from\n# preprocessor.size, otherwise, images won't be resized.\nimage_size: int | tuple[int, int] | None\n# The resampling algorithm to use for image resizing. Default is bilinear. Please refer\n# to PIL.Image.Resampling for more details.\nimage_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None\n\n# optional overrides to the base model configuration\noverrides_of_model_config: dict[str, Any] | None\n# optional overrides the base model loading from_pretrained\noverrides_of_model_kwargs: dict[str, Any] | None\n# If you want to specify the type of model to load, AutoModelForCausalLM is a good\n# choice too\ntype_of_model: str | None\n# You can specify to choose a specific model revision from huggingface hub\nrevision_of_model: str | None\n\nmax_packed_sequence_len: int | None\nrope_scaling: Any | None\nnoisy_embedding_alpha: float | None\ndpo_beta: float | None\nevaluation_strategy: str | None\neval_table_size: int | None\neval_max_new_tokens: int | None\ndpo_use_logits_to_keep: bool | None\ndpo_generate_during_eval: bool | None\ndpo_norm_loss: bool | None\nrpo_alpha: float | None",
+    "text": "# Allow overwrite yml config using from cli\nstrict: bool | None = False\n# Resume from a specific checkpoint dir\nresume_from_checkpoint: str | None\n# If resume_from_checkpoint isn't set and you simply want it to start where it left off.\n# Be careful with this being turned on between different models.\nauto_resume_from_checkpoints: bool | None\n# Resize the model embeddings when new tokens are added to multiples of 32. This is\n# reported to improve training speed on some models\nresize_token_embeddings_to_32x: bool | None\nmean_resizing_embeddings: bool | None = False\n\n# Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.\nshrink_embeddings: bool | None\n# Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs\nembeddings_skip_upcast: bool | None\n# Reinitialize model weights randomly instead of loading pretrained weights\nreinit_weights: bool | None\n\n# module to custom trainer class to use for training\ntrainer_cls: str | None\n\n# Use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo', 'ebft'\nrl: RLType | None\n\ntrl: TRLConfig | None\n  # For TRLConfig:\n  # Beta parameter for the RL training. Same as `rl_beta`. Use\n  beta: float | None\n  # Maximum length of the completion for RL training.\n  max_completion_length: int | None\n\n  # Whether to use VLLM for RL training.\n  use_vllm: bool = False\n  # VLLM mode to use, one of 'server' or 'colocate'\n  vllm_mode: Literal['server', 'colocate'] | None\n  # Host of the vLLM server to connect to.\n  vllm_server_host: str | None = 0.0.0.0\n  # Port of the vLLM server to connect to.\n  vllm_server_port: int | None = 8000\n  # Total timeout (in seconds) to wait for the vLLM server to respond.\n  vllm_server_timeout: int | None\n  # Regex for vLLM guided decoding.\n  vllm_guided_decoding_regex: str | None\n\n  # List of reward functions to load. Paths must be importable from current dir.\n  reward_funcs: list[str] | None\n  # List of reward weights for the reward functions.\n  reward_weights: list[float] | None\n  # Batch size for generation. Controls how many unique prompts are generated per step.\n  # For full DP utilization, set to num_generations * data_parallel_size (or a multiple\n  # thereof).\n  generation_batch_size: int | None\n  # Number of generations to sample.\n  num_generations: int | None\n  # Whether to log completions.\n  log_completions: bool | None = False\n  # Number of completions to print when log_completions is True.\n  num_completions_to_print: int | None\n  # Controls whether importance sampling ratios are computed at the `'token'` or\n  # `'sequence'` level. For GSPO, use `sequence`, default is None which corresponds to\n  # the original GRPO paper.\n  importance_sampling_level: Literal['sequence', 'token'] | None\n\n  # Whether to sync the reference model.\n  sync_ref_model: bool | None = False\n  # Mixup alpha for the reference model.\n  ref_model_mixup_alpha: float | None = 0.9\n  # Sync steps for the reference model.\n  ref_model_sync_steps: int | None = 64\n  # Whether to scale rewards by their standard deviation.\n  scale_rewards: bool = True\n\n  # Sampling temperature for the GRPO policy.\n  temperature: float | None\n  # Top-p sampling probability for the generation policy.\n  top_p: float | None\n  # Top-k sampling for the generation policy.\n  top_k: int | None\n  # Minimum probability for the generation policy.\n  min_p: float | None\n  # Penalty for tokens that appear in prompt and generated text.\n  repetition_penalty: float | None\n  # Additional generation parameters passed to vLLM SamplingParams. Useful for\n  # stop_token_ids, seed, frequency_penalty, etc.\n  generation_kwargs: dict[str, Any] | None\n  # Additional kwargs for the chat template. E.g., {enable_thinking: false} for Qwen3.5\n  # models.\n  chat_template_kwargs: dict[str, Any] | None\n  # Number of iterations per batch (μ) for GRPO.\n  num_iterations: int | None\n  # Epsilon value for clipping in the GRPO algorithm.\n  epsilon: float | None\n  # Upper-bound epsilon value for clipping in the GRPO algorithm.\n  epsilon_high: float | None\n  # Whether to use Liger loss for GRPO.\n  use_liger_loss: bool | None\n  # Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.\n  loss_type: str | None\n  # Whether to exclude truncated completions from loss calculation.\n  mask_truncated_completions: bool = False\n  # Enable sleep mode for vLLM to offload VRAM when idle\n  vllm_enable_sleep_mode: bool | None\n  # Path to custom rollout function. Must be importable from current dir.\n  rollout_func: str | None\n  # Multi-objective reward aggregation strategy. 'sum_then_normalize' (GRPO default):\n  # weights and sums rewards first, then normalizes. 'normalize_then_sum' (GDPO):\n  # normalizes each reward independently, then sums.\n  multi_objective_aggregation: Literal['sum_then_normalize', 'normalize_then_sum'] | None\n\n  # Use the GRPODataProducer protocol for online data generation.\n  use_data_producer: bool = False\n  # Generate rollouts in a background thread while training on the previous rollout.\n  async_prefetch: bool = False\n  # Number of rollouts to prefetch ahead of training.\n  prefetch_depth: int | None\n  # Sync model weights to vLLM every N optimizer steps (async mode only).\n  vllm_sync_interval: int | None\n  # Score prompt groups incrementally instead of the full batch at once.\n  streaming_partial_batch: bool | None\n  # Minimum prompt groups to score per streaming chunk.\n  streaming_min_groups: int | None\n  # Apply IS correction for distribution mismatch between vLLM and training model.\n  vllm_importance_sampling_correction: bool | None\n  # IS mode: token_truncate, token_mask, sequence_truncate, or sequence_mask.\n  vllm_importance_sampling_mode: Literal['token_truncate', 'token_mask', 'sequence_truncate', 'sequence_mask'] | None\n  # Cap C for IS ratio clipping/masking.\n  vllm_importance_sampling_cap: float | None\n  # KL threshold for off-policy sequence masking (OPSM). None = disabled.\n  off_policy_mask_threshold: float | None\n  # Apply IS correction to KL divergence term.\n  use_bias_correction_kl: bool | None\n\n  # Number of persistent subprocess workers for parallel reward computation. Each worker\n  # has its own main thread so signal.alarm() (used by math_verify) works correctly.\n  # Work is sharded across workers by prompt groups. Only used with\n  # use_data_producer=True and non-nn.Module reward functions.\n  reward_num_workers: int = 1\n  # [Experimental, disabled by default] Size of the replay buffer for storing high-\n  # signal rollout groups. When &gt; 0, groups with reward variance are cached and used to\n  # replace zero-signal groups (where all rewards are identical). Set to 0 to disable.\n  # Only used with use_data_producer=True.\n  replay_buffer_size: int = 0\n  # When True (default), recompute old_per_token_logps for replayed groups using the\n  # current training model. This fixes the importance sampling mismatch that occurs when\n  # replaying stale data. Only relevant when replay_buffer_size &gt; 0.\n  replay_recompute_logps: bool = True\n  # Fraction of total training steps after which deferred re-rolling begins. Zero-signal\n  # prompts (where all rewards in a group are identical) are buffered and re-injected\n  # into later batches when the model is more likely to solve them. Set to 1.0 to\n  # disable. Only used with use_data_producer=True.\n  reroll_start_fraction: float = 1.0\n  # Maximum number of prompt groups to replace with re-roll candidates per batch. Higher\n  # values increase data utilization but reduce prompt diversity. Only used with\n  # use_data_producer=True.\n  reroll_max_groups: int = 1\n  # When True, skip gradient computation for micro-batches where all advantages are zero\n  # (no learning signal). This avoids the forward/backward pass entirely when no\n  # learning signal is present. The step is logged with skipped_zero_adv_batches=1 for\n  # monitoring.\n  skip_zero_advantage_batches: bool = True\n  # Sync LoRA adapter to vLLM via filesystem instead of merging + NCCL broadcast. Auto-\n  # selects vllm_serve_lora serve module. Syncs only LoRA adapter weights vs full merged\n  # model.\n  vllm_lora_sync: bool = False\n\nvllm: VllmConfig | None\n  # For VllmConfig:\n  # Device to use for VLLM\n  device: str | None = auto\n  # Tensor parallel size for VLLM\n  tensor_parallel_size: int | None\n  # Data parallel size for VLLM\n  data_parallel_size: int | None\n  # GPU memory utilization for VLLM\n  gpu_memory_utilization: float | None = 0.9\n  # Data type for VLLM\n  dtype: str | None = auto\n  # Maximum length of the model context for VLLM\n  max_model_len: int | None\n  # Enable prefix caching for VLLM\n  enable_prefix_caching: bool | None\n  # Host for the vLLM server to start on\n  host: str | None = 0.0.0.0\n  # Port of the vLLM server to start on\n  port: int | None = 8000\n\n  # Enable reasoning for VLLM\n  enable_reasoning: bool | None\n  # Reasoning parser for VLLM\n  reasoning_parser: str | None\n  # Disable CUDA graph capture in vLLM. Required for models with causal_conv1d (e.g.,\n  # Qwen3.5 hybrid linear attention).\n  enforce_eager: bool | None\n  # Python module for vLLM serve script. Set to 'axolotl.scripts.vllm_serve_lora' for\n  # native LoRA support, or leave None for default TRL serve.\n  serve_module: str | None\n  # vLLM worker extension class for weight synchronization. Defaults to\n  # 'trl.scripts.vllm_serve.WeightSyncWorkerExtension'.\n  worker_extension_cls: str | None\n\n# Configuration for Energy-Based Fine-Tuning (EBFT)\nebft: EBFTConfig | None\n  # For EBFTConfig:\n  # Fractional layer depths for feature extraction (e.g., [0.25, 0.5, 0.75])\n  feature_layers: list[float] = [0.25, 0.5, 0.75]\n  # Embedding method: 'last_token', 'mean_pooling', 'completion_mean', or 'concat'\n  embed_method: Literal['last_token', 'mean_pooling', 'completion_mean', 'concat'] = last_token\n  # Apply SVD whitening to feature embeddings\n  use_whitening: bool = False\n  # Coefficient for alignment reward (cosine similarity with ground truth)\n  alignment_coef: float = 1.0\n  # Coefficient for diversity penalty (pairwise similarity between samples)\n  diversity_coef: float = 1.0\n  # Cross-entropy loss coefficient on ground-truth tokens\n  ce_coef: float = 0.0\n  # Set per-batch max_tokens based on ground-truth length\n  adaptive_max_tokens: bool = True\n  # Multiplier for ground-truth token count when computing adaptive max_tokens\n  gt_length_multiplier: float = 1.5\n\n  # EBFT mode: 'structured' (QA with vLLM) or 'strided' (unstructured text)\n  mode: Literal['structured', 'strided'] = structured\n  # Stride between anchor points (tokens)\n  stride: int = 8\n  # Context window size per block\n  context_length: int = 8\n  # Tokens to generate per block\n  generate_max_len: int = 8\n  # Independent rollouts per document\n  n_samples_per_prompt: int = 4\n  # Sampling temperature for strided generation\n  temperature: float = 0.6\n  # Top-p nucleus sampling threshold\n  top_p: float = 1.0\n  # RL policy gradient loss coefficient\n  rl_coef: float = 1.0\n  # Advantage estimator: 'rloo', 'group_norm', 'reinforce'\n  advantage_estimator: Literal['rloo', 'group_norm', 'reinforce'] = rloo\n  # Minimum tokens into completion before placing anchors. Skips anchors too close to\n  # the prompt boundary where features are dominated by prompt context.\n  min_completion_prefix: int = 0\n\nqat: QATConfig | None\n  # For QATConfig:\n  # Fake quantization layout to use for activation quantization.\n  activation_dtype: TorchAOQuantDType | None\n  # Fake quantization layout to use for weight quantization.\n  weight_dtype: TorchAOQuantDType = TorchAOQuantDType.int8\n  # Quantize embedding\n  quantize_embedding: bool | None = False\n  # The number of elements in each group for per-group fake quantization\n  group_size: int | None = 32\n  # The number of steps to apply fake quantization after\n  fake_quant_after_n_steps: int | None\n\nquantization: PTQConfig | None\n  # For PTQConfig:\n  # Fake quantization layout to use for weight quantization.\n  weight_dtype: TorchAOQuantDType = TorchAOQuantDType.int8\n  # Fake quantization layout to use for activation quantization.\n  activation_dtype: TorchAOQuantDType | None\n  # Whether to quantize the embedding layer.\n  quantize_embedding: bool | None\n  # The number of elements in each group for per-group fake quantization\n  group_size: int | None = 32\n\n# Reward modelling: `True` or `False`\nreward_model: bool | None\n\n# Configuration for dynamic checkpointing (trigger by file or signal). Set 'enabled:\n# true' to activate this feature.\ndynamic_checkpoint: DynamicCheckpointConfig | None\n  # For DynamicCheckpointConfig:\n  # Enable dynamic checkpoint triggering during training. Create a file\n  # 'axolotl_checkpoint.save' in the configured `output_dir` to trigger.\n  enabled: bool = False\n  # Check for trigger file every N steps (reduces I/O overhead). Default: 100\n  check_interval: int = 10\n  # Custom trigger filename (optional). If not specified, defaults to\n  # 'axolotl_checkpoint.save'. Specify a filename (not a full path) to override the\n  # default.\n  trigger_file_path: str = \n\n# Process reward modelling: `True` or `False`\nprocess_reward_model: bool | None\n# Coefficient to incentivize the reward model to output mean-zero rewards (proposed by\n# https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`.\ncenter_rewards_coefficient: float | None\nnum_labels: int | None\n\n# Whether to perform weighting in DPO trainer\ndpo_use_weighting: bool | None\ndpo_label_smoothing: float | None\n# Precompute reference model log probabilities for DPO\nprecompute_ref_log_probs: bool | None\n\n# Whether to use Liger kernel for DPO loss.\ndpo_use_liger_kernel: bool | None\n\ndpo_padding_free: bool | None\n\n# A list of one or more datasets to finetune the model with\ndatasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset | SyntheticDataset], MinLen(1)] | None\n  # For SFTDataset:\n  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n  path: str | None\n  # name of dataset split to load from\n  split: str | None\n  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n  type: str | UserDefinedPrompterType | None\n    # For UserDefinedPrompterType:\n    # Custom user instruction prompt\n    system_prompt: str | None\n    # Use {system} as key to be replaced\n    system_format: str | None\n    field_system: str | None\n    field_instruction: str | None\n    field_input: str | None\n    field_output: str | None\n\n    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n    # be replaced. 'format' can include {input}\n    format: str | None\n    # 'no_input_format' cannot include {input}\n    no_input_format: str | None\n  input_transform: str | None\n  # split dataset into N pieces (use with shards_idx)\n  shards: int | None\n  # the index of sharded dataset to use\n  shards_idx: int | None\n  # process dataset in N sequential chunks for memory efficiency (exclusive with\n  # `shards`)\n  preprocess_shards: int | None\n  conversation: str | None\n\n  # The name of the chat template to use for training, following values are supported:\n  # tokenizer_default: Uses the chat template that is available in the\n  # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n  # will raise an error. This is the default.\n  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n  # template. The custom jinja template should be provided in the chat_template_jinja\n  # field.\n  chat_template: ChatTemplate | str | None\n  # Custom jinja chat template or path to jinja file. Used only if `chat_template:\n  # jinja` or empty.\n  chat_template_jinja: str | None\n  # path to source data files\n  data_files: str | list[str] | None\n  input_format: str | None\n  # name of dataset configuration to load\n  name: str | None\n  # defines the datatype when path is a file\n  ds_type: str | None\n  # For `completion` datasets only, uses the provided field instead of `text` column\n  field: str | None\n  field_human: str | None\n  field_model: str | None\n  # Key containing the messages (default: \"messages\")\n  field_messages: str | None\n  # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n  # schema](https://json-schema.org/learn/getting-started-step-by-step).\n  field_tools: str | None\n  # Key containing the reasoning trace (default: \"reasoning_content\").\n  field_thinking: str | None\n  # The key the chat template expects that indicates the reasoning trace.\n  template_thinking_key: str | None\n\n  message_field_role: str | None\n\n  message_field_content: str | None\n  # Mapping of properties from the input dataset to the chat template. (default:\n  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n  # in the template but not in this mapping, the system will attempt to load it directly\n  # from the message using the property name as the key. Example: In the mapping below,\n  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n  # used as 'content' in the chat template.\n  message_property_mappings: dict[str, str] | None\n  # The key in the message turn that indicates via boolean whether tokens of a turn\n  # should be considered for training. Useful to selectively train on certain turns\n  # besides the `roles_to_train`.\n  message_field_training: str | None\n  # The key in the message turn that contains the training details. Useful to\n  # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n  # containing `begin_offset` (start character index in content), `end_offset` (end\n  # character index in content), and `train` (boolean whether to train).\n  message_field_training_detail: str | None\n  # (for Qwen3 template only) Whether to split the assistant content based on a\n  # reasoning trace inside delimited tags\n  split_thinking: bool | None\n  logprobs_field: str | None\n  temperature: float | None\n  # Roles to train on. The tokens from these roles will be considered for the loss.\n  roles_to_train: list[str] | None\n  # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n  # turn, last: train on the last EOS token in the conversation\n  train_on_eos: Literal['all', 'turn', 'last'] | None\n  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n  # source roles will be mapped to the target role. The default is: user: [\"human\",\n  # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n  roles: dict[str, list[str]] | None\n  # Whether to drop the system turn from the dataset. Only works with chat_template.\n  # This does not drop the default system message from chat_template if it exists. If\n  # you wish to, we recommend using a custom jinja template with the default system\n  # message removed or adding a system turn with empty content.\n  drop_system_message: bool | None\n  # Trust remote code for untrusted source\n  trust_remote_code: bool | None = False\n  # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n  # This can be a commit hash, tag, or branch name. If not specified, the latest version\n  # will be used. This parameter is ignored for local datasets.\n  revision: str | None\n\n  # For DPODataset:\n  path: str | None\n  split: str | None\n  type: UserDefinedDPOType | str | None\n    # For UserDefinedDPOType:\n    field_system: str | None\n    field_prompt: str | None\n    field_chosen: str | None\n    field_rejected: str | None\n    prompt_format: str | None\n    chosen_format: str | None\n    rejected_format: str | None\n  data_files: list[str] | None\n  revision: str | None\n  field_messages: str | None\n\n  # For KTODataset:\n  path: str | None\n  split: str | None\n  type: UserDefinedKTOType | str | None\n    # For UserDefinedKTOType:\n    field_system: str | None\n    field_prompt: str | None\n    field_completion: str | None\n    field_label: bool | None\n    prompt_format: str | None\n    completion_format: str | None\n  data_files: list[str] | None\n  trust_remote_code: bool | None = False\n  revision: str | None\n\n  # For StepwiseSupervisedDataset:\n  path: str | None\n  split: str | None\n  data_files: list[str] | None\n  revision: str | None\n  step_separator: str | None\n  max_completion_length: int | None\n  train_on_last_step_only: bool | None\n\n  # For SyntheticDataset:\n  path: Literal['synthetic'] = synthetic\n  type: Literal['_synthetic'] = _synthetic\n  # Number of rows to generate\n  length: int = 1000\n  # Sequence length per row (defaults to sequence_len from config)\n  sequence_length: int | None\n  # Minimum token ID for generation\n  min_input_id: int = 100\n  # Maximum token ID for generation (defaults to tokenizer vocab_size)\n  max_input_id: int | None\n  # Random seed for reproducibility\n  seed: int | None\n\n# A list of one or more datasets to eval the model with. You can use either\n# test_datasets, or val_set_size, but not both.\ntest_datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset | SyntheticDataset], MinLen(1)] | None\n  # For SFTDataset:\n  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n  path: str | None\n  # name of dataset split to load from\n  split: str | None\n  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n  type: str | UserDefinedPrompterType | None\n    # For UserDefinedPrompterType:\n    # Custom user instruction prompt\n    system_prompt: str | None\n    # Use {system} as key to be replaced\n    system_format: str | None\n    field_system: str | None\n    field_instruction: str | None\n    field_input: str | None\n    field_output: str | None\n\n    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n    # be replaced. 'format' can include {input}\n    format: str | None\n    # 'no_input_format' cannot include {input}\n    no_input_format: str | None\n  input_transform: str | None\n  # split dataset into N pieces (use with shards_idx)\n  shards: int | None\n  # the index of sharded dataset to use\n  shards_idx: int | None\n  # process dataset in N sequential chunks for memory efficiency (exclusive with\n  # `shards`)\n  preprocess_shards: int | None\n  conversation: str | None\n\n  # The name of the chat template to use for training, following values are supported:\n  # tokenizer_default: Uses the chat template that is available in the\n  # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n  # will raise an error. This is the default.\n  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n  # template. The custom jinja template should be provided in the chat_template_jinja\n  # field.\n  chat_template: ChatTemplate | str | None\n  # Custom jinja chat template or path to jinja file. Used only if `chat_template:\n  # jinja` or empty.\n  chat_template_jinja: str | None\n  # path to source data files\n  data_files: str | list[str] | None\n  input_format: str | None\n  # name of dataset configuration to load\n  name: str | None\n  # defines the datatype when path is a file\n  ds_type: str | None\n  # For `completion` datasets only, uses the provided field instead of `text` column\n  field: str | None\n  field_human: str | None\n  field_model: str | None\n  # Key containing the messages (default: \"messages\")\n  field_messages: str | None\n  # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n  # schema](https://json-schema.org/learn/getting-started-step-by-step).\n  field_tools: str | None\n  # Key containing the reasoning trace (default: \"reasoning_content\").\n  field_thinking: str | None\n  # The key the chat template expects that indicates the reasoning trace.\n  template_thinking_key: str | None\n\n  message_field_role: str | None\n\n  message_field_content: str | None\n  # Mapping of properties from the input dataset to the chat template. (default:\n  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n  # in the template but not in this mapping, the system will attempt to load it directly\n  # from the message using the property name as the key. Example: In the mapping below,\n  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n  # used as 'content' in the chat template.\n  message_property_mappings: dict[str, str] | None\n  # The key in the message turn that indicates via boolean whether tokens of a turn\n  # should be considered for training. Useful to selectively train on certain turns\n  # besides the `roles_to_train`.\n  message_field_training: str | None\n  # The key in the message turn that contains the training details. Useful to\n  # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n  # containing `begin_offset` (start character index in content), `end_offset` (end\n  # character index in content), and `train` (boolean whether to train).\n  message_field_training_detail: str | None\n  # (for Qwen3 template only) Whether to split the assistant content based on a\n  # reasoning trace inside delimited tags\n  split_thinking: bool | None\n  logprobs_field: str | None\n  temperature: float | None\n  # Roles to train on. The tokens from these roles will be considered for the loss.\n  roles_to_train: list[str] | None\n  # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n  # turn, last: train on the last EOS token in the conversation\n  train_on_eos: Literal['all', 'turn', 'last'] | None\n  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n  # source roles will be mapped to the target role. The default is: user: [\"human\",\n  # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n  roles: dict[str, list[str]] | None\n  # Whether to drop the system turn from the dataset. Only works with chat_template.\n  # This does not drop the default system message from chat_template if it exists. If\n  # you wish to, we recommend using a custom jinja template with the default system\n  # message removed or adding a system turn with empty content.\n  drop_system_message: bool | None\n  # Trust remote code for untrusted source\n  trust_remote_code: bool | None = False\n  # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n  # This can be a commit hash, tag, or branch name. If not specified, the latest version\n  # will be used. This parameter is ignored for local datasets.\n  revision: str | None\n\n  # For DPODataset:\n  path: str | None\n  split: str | None\n  type: UserDefinedDPOType | str | None\n    # For UserDefinedDPOType:\n    field_system: str | None\n    field_prompt: str | None\n    field_chosen: str | None\n    field_rejected: str | None\n    prompt_format: str | None\n    chosen_format: str | None\n    rejected_format: str | None\n  data_files: list[str] | None\n  revision: str | None\n  field_messages: str | None\n\n  # For KTODataset:\n  path: str | None\n  split: str | None\n  type: UserDefinedKTOType | str | None\n    # For UserDefinedKTOType:\n    field_system: str | None\n    field_prompt: str | None\n    field_completion: str | None\n    field_label: bool | None\n    prompt_format: str | None\n    completion_format: str | None\n  data_files: list[str] | None\n  trust_remote_code: bool | None = False\n  revision: str | None\n\n  # For StepwiseSupervisedDataset:\n  path: str | None\n  split: str | None\n  data_files: list[str] | None\n  revision: str | None\n  step_separator: str | None\n  max_completion_length: int | None\n  train_on_last_step_only: bool | None\n\n  # For SyntheticDataset:\n  path: Literal['synthetic'] = synthetic\n  type: Literal['_synthetic'] = _synthetic\n  # Number of rows to generate\n  length: int = 1000\n  # Sequence length per row (defaults to sequence_len from config)\n  sequence_length: int | None\n  # Minimum token ID for generation\n  min_input_id: int = 100\n  # Maximum token ID for generation (defaults to tokenizer vocab_size)\n  max_input_id: int | None\n  # Random seed for reproducibility\n  seed: int | None\n\n# If false, the datasets will not be shuffled and will keep their original order in\n# `datasets`. The same applies to the `test_datasets` option and the\n# `pretraining_dataset` option. Default is true.\nshuffle_merged_datasets: bool | None = True\n# If true, each dataset in `datasets` will be shuffled before merging. This allows\n# curriculum learning strategies to be applied at the dataset level. Default is false.\nshuffle_before_merging_datasets: bool | None = False\n# Axolotl attempts to save the dataset as an arrow after packing the data together so\n# subsequent training attempts load faster, relative path\ndataset_prepared_path: str | None\n# Num shards for whole dataset\ndataset_shard_num: int | None\n# Index of shard to use for whole dataset\ndataset_shard_idx: int | None\nskip_prepare_dataset: bool | None = False\n# Number of shards to save the prepared dataset\nnum_dataset_shards_to_save: int | None\n\n# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize\npretraining_dataset: Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None\n  # For PretrainingDataset:\n  name: str | None\n  path: str | None\n  split: str | None = train\n  text_column: str | None = text\n  type: str | None = pretrain\n  trust_remote_code: bool | None = False\n  data_files: str | None\n  skip: int | None\n\n  # For SFTDataset:\n  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n  path: str | None\n  # name of dataset split to load from\n  split: str | None\n  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n  type: str | UserDefinedPrompterType | None\n    # For UserDefinedPrompterType:\n    # Custom user instruction prompt\n    system_prompt: str | None\n    # Use {system} as key to be replaced\n    system_format: str | None\n    field_system: str | None\n    field_instruction: str | None\n    field_input: str | None\n    field_output: str | None\n\n    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n    # be replaced. 'format' can include {input}\n    format: str | None\n    # 'no_input_format' cannot include {input}\n    no_input_format: str | None\n  input_transform: str | None\n  # split dataset into N pieces (use with shards_idx)\n  shards: int | None\n  # the index of sharded dataset to use\n  shards_idx: int | None\n  # process dataset in N sequential chunks for memory efficiency (exclusive with\n  # `shards`)\n  preprocess_shards: int | None\n  conversation: str | None\n\n  # The name of the chat template to use for training, following values are supported:\n  # tokenizer_default: Uses the chat template that is available in the\n  # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n  # will raise an error. This is the default.\n  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n  # template. The custom jinja template should be provided in the chat_template_jinja\n  # field.\n  chat_template: ChatTemplate | str | None\n  # Custom jinja chat template or path to jinja file. Used only if `chat_template:\n  # jinja` or empty.\n  chat_template_jinja: str | None\n  # path to source data files\n  data_files: str | list[str] | None\n  input_format: str | None\n  # name of dataset configuration to load\n  name: str | None\n  # defines the datatype when path is a file\n  ds_type: str | None\n  # For `completion` datasets only, uses the provided field instead of `text` column\n  field: str | None\n  field_human: str | None\n  field_model: str | None\n  # Key containing the messages (default: \"messages\")\n  field_messages: str | None\n  # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n  # schema](https://json-schema.org/learn/getting-started-step-by-step).\n  field_tools: str | None\n  # Key containing the reasoning trace (default: \"reasoning_content\").\n  field_thinking: str | None\n  # The key the chat template expects that indicates the reasoning trace.\n  template_thinking_key: str | None\n\n  message_field_role: str | None\n\n  message_field_content: str | None\n  # Mapping of properties from the input dataset to the chat template. (default:\n  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n  # in the template but not in this mapping, the system will attempt to load it directly\n  # from the message using the property name as the key. Example: In the mapping below,\n  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n  # used as 'content' in the chat template.\n  message_property_mappings: dict[str, str] | None\n  # The key in the message turn that indicates via boolean whether tokens of a turn\n  # should be considered for training. Useful to selectively train on certain turns\n  # besides the `roles_to_train`.\n  message_field_training: str | None\n  # The key in the message turn that contains the training details. Useful to\n  # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n  # containing `begin_offset` (start character index in content), `end_offset` (end\n  # character index in content), and `train` (boolean whether to train).\n  message_field_training_detail: str | None\n  # (for Qwen3 template only) Whether to split the assistant content based on a\n  # reasoning trace inside delimited tags\n  split_thinking: bool | None\n  logprobs_field: str | None\n  temperature: float | None\n  # Roles to train on. The tokens from these roles will be considered for the loss.\n  roles_to_train: list[str] | None\n  # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n  # turn, last: train on the last EOS token in the conversation\n  train_on_eos: Literal['all', 'turn', 'last'] | None\n  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n  # source roles will be mapped to the target role. The default is: user: [\"human\",\n  # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n  roles: dict[str, list[str]] | None\n  # Whether to drop the system turn from the dataset. Only works with chat_template.\n  # This does not drop the default system message from chat_template if it exists. If\n  # you wish to, we recommend using a custom jinja template with the default system\n  # message removed or adding a system turn with empty content.\n  drop_system_message: bool | None\n  # Trust remote code for untrusted source\n  trust_remote_code: bool | None = False\n  # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n  # This can be a commit hash, tag, or branch name. If not specified, the latest version\n  # will be used. This parameter is ignored for local datasets.\n  revision: str | None\n\n# The maximum number of processes to use while preprocessing your input dataset. This\n# defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of\n# vCPUs via RUNPOD_CPU_COUNT.\ndataset_processes: int | None\n# The maximum number of processes to use while preprocessing your input dataset. This\n# defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of\n# vCPUs via RUNPOD_CPU_COUNT.\ndataset_num_proc: int | None\n\n# Deduplicates datasets and test_datasets with identical entries\ndataset_exact_deduplication: bool | None\n# Keep dataset in memory while preprocessing. Only needed if cached dataset is taking\n# too much storage\ndataset_keep_in_memory: bool | None\ndataloader_pin_memory: bool | None\ndataloader_num_workers: int | None\ndataloader_prefetch_factor: int | None\ndataloader_drop_last: bool | None\n\naccelerator_config: dict[str, Any] | None\n\nremove_unused_columns: bool | None\n\n# Push prepared dataset to hub - repo_org/repo_name\npush_dataset_to_hub: str | None\n# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private\n# datasets. Required to be true when used in combination with `push_dataset_to_hub`\nhf_use_auth_token: bool | None\n\ndevice: Any | None\n# Passed through to transformers when loading the model when launched without\n# accelerate. Use `sequential` when training w/ model parallelism to limit memory\ndevice_map: Any | None\nworld_size: int | None\n# Don't mess with this, it's here for accelerate and torchrun\nlocal_rank: int | None\nddp: bool | None\n\n# Seed for reproducibility\nseed: int | None\n# Advanced DDP Arguments - timeout\nddp_timeout: int | None\n# Advanced DDP Arguments - bucket cap in MB\nddp_bucket_cap_mb: int | None\n# Advanced DDP Arguments - broadcast buffers\nddp_broadcast_buffers: bool | None\nddp_find_unused_parameters: bool | None\n\n# Whether to run causal language model evaluation for metrics in\n# `eval_causal_lm_metrics`\ndo_causal_lm_eval: bool | None\n# HF evaluate metrics used during evaluation. Default is ['sacrebleu', 'comet', 'ter',\n# 'chrf', 'perplexity']\neval_causal_lm_metrics: list[str] | None\ndo_bench_eval: bool | None\nbench_dataset: str | None\nbench_split: str | None\nmetric_for_best_model: str | None\ngreater_is_better: bool | None\n\n# High loss value, indicating the learning has broken down (a good estimate is ~2 times\n# the loss at the start of training)\nloss_watchdog_threshold: float | None\n# Number of high-loss steps in a row before the trainer aborts (default: 3)\nloss_watchdog_patience: int | None\n\n# Run garbage collection every `gc_steps` steps. -1 will run on epoch end and before\n# evaluations. Default is 0 (disabled).\ngc_steps: int | None\n\n# Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection.\n# require &gt;=ampere\nbf16: Literal['auto'] | bool | None = auto\n# Use CUDA fp16\nfp16: bool | None\n# Enable FP8 mixed precision training using TorchAO. Best used in combination with\n# torch.compile.\nfp8: bool | None\n# Enable FSDP float8 all-gather optimization for FP8 training. Can improve training\n# speed by 10-15% when FSDP is enabled.\nfp8_enable_fsdp_float8_all_gather: bool | None\n# No AMP (automatic mixed precision) - require &gt;=ampere\nbfloat16: bool | None\n# No AMP (automatic mixed precision)\nfloat16: bool | None\n# bool to use CUDA tf32 or 'auto' for automatic detection - require &gt;=ampere\ntf32: Literal['auto'] | bool | None = auto\nfloat32: bool | None\n\n# Whether to use gradient checkpointing. Available options are: true, false, 'offload',\n# 'offload_disk'.\n# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\ngradient_checkpointing: Literal['offload', 'offload_disk'] | bool | None = False\n# Additional kwargs to pass to the trainer for gradient checkpointing\ngradient_checkpointing_kwargs: dict[str, Any] | None\n# Whether to offload activations. Available options are: true, false, 'legacy', 'disk'.\nactivation_offloading: Literal['legacy', 'disk'] | bool | None = False\n# Offload model layer parameters to CPU during forward, prefetch back during backward.\nlayer_offloading: bool | None = False\n\n# List of regex patterns for parameter names to keep unfrozen. All other parameters will\n# be frozen via requires_grad=False. Note: range-based patterns (e.g.\n# embed_tokens.weight$[:32000]) use gradient zeroing rather than a true freeze, so\n# weight decay will still apply to the frozen portion and optimizer states are allocated\n# for the full parameter.\nunfrozen_parameters: list[str] | None\n\n# The maximum length of an input to train with, this should typically be less than 2048\n# as most models have a token/context limit of 2048\nsequence_len: int = 512\n# What to do when a tokenized row exceeds sequence_len. 'drop' removes the row;\n# 'truncate' slices tensors to sequence_len; 'raise' raises a ValueError. Defaults to\n# 'drop' for backward compatibility.\nexcess_length_strategy: Literal['drop', 'truncate', 'raise'] | None\n# The maximum length of an input for evaluation. If not specified, defaults to\n# sequence_len\neval_sequence_len: int | None\nmin_sample_len: int | None\n# maximum prompt length for RL training\nmax_prompt_len: int | None\n# Use efficient multi-packing with block diagonal attention and per sequence\n# position_ids. Recommend set to 'true'\nsample_packing: bool | None\n# The number of samples packed at a time. Increasing the following values helps with\n# packing, but usually only slightly (&lt;%1.)\nsample_packing_group_size: int | None = 100000\n# The number of samples which can be packed into one sequence. Increase if using a large\n# sequence_len with many short samples.\nsample_packing_bin_size: int | None = 200\n# Whether to pack samples sequentially\nsample_packing_sequentially: bool | None\n# The multiprocessing start method to use for packing. Should be 'fork', 'spawn' or\n# 'forkserver'\nsample_packing_mp_start_method: str | None\n# Set to 'false' if getting errors during eval with sample_packing on\neval_sample_packing: bool | None\n# Pad inputs so each step uses constant sized buffers. This will reduce memory\n# fragmentation and may prevent OOMs, by re-using memory more efficiently. Defaults to\n# True if `sample_packing` enabled\npad_to_sequence_len: bool | None\n# Whether to use sequential sampling for curriculum learning\ncurriculum_sampling: bool | None\nmultipack_real_batches: bool | None\n\n# Use batch flattening for speedups when not using sample_packing\nbatch_flattening: Literal['auto'] | bool | None\n\nuse_pose: bool | None\npose_split_on_token_ids: list[int] | None\npose_max_context_len: int | None\npose_num_chunks: int | None\n\npretrain_multipack_buffer_size: int | None\n# whether to prevent cross attention for packed sequences during pretraining\npretrain_multipack_attn: bool | None = True\n# whether to concatenate samples during pretraining\npretraining_sample_concatenation: bool | None\n\n# Use streaming mode for loading datasets\nstreaming: bool | None\n# Buffer size for multipack streaming datasets\nstreaming_multipack_buffer_size: int | None = 10000\n\n# Whether to use xformers attention patch https://github.com/facebookresearch/xformers\nxformers_attention: bool | None\n# Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/\n# torch.nn.functional.scaled_dot_product_attention.html\nsdp_attention: bool | None\n# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf\ns2_attention: bool | None\nflex_attention: bool | None\nflex_attn_compile_kwargs: dict[str, Any] | None\n# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention\nflash_attention: bool | None\n# Whether to use flash-attention cross entropy implementation - advanced use only\nflash_attn_cross_entropy: bool | None\n# Whether to use flash-attention rms norm implementation - advanced use only\nflash_attn_rms_norm: bool | None\n# Whether to fuse part of the MLP into a single operation\nflash_attn_fuse_mlp: bool | None\n# Whether to use bettertransformers\nflash_optimum: bool | None\n# Whether to use SageAttention https://github.com/thu-ml/SageAttention\nsage_attention: bool | None\n\neager_attention: bool | None\n\n# Specify a custom attention implementation, used mostly for kernels.\nattn_implementation: str | None\n\n# Which experts implementation to use for MoE models,\nexperts_implementation: str | None\n\n# Quantize MoE expert weights on load to reduce VRAM. Requires adapter (lora/qlora) with\n# load_in_4bit or load_in_8bit. Requires CUDA (not compatible with ROCm or other\n# backends). Note: total parameter count may be reported incorrectly when enabled\n# (trainable param count is correct).\nquantize_moe_experts: bool = False\n\n# Whether to use Scaled Softmax (SSMax) attention. Ref: https://arxiv.org/abs/2501.19399\nscaling_softmax: bool | None\n# Scaling factor for SSMax attention. Default is 0.43\nscaling_softmax_factor: float | None\n# Bias for SSMax attention. Default is 0.0. Note: The paper recommends bias=0 for better\n# length generalization.\nscaling_softmax_bias: float | None\n\nunsloth_cross_entropy_loss: bool | None\nunsloth_lora_mlp: bool | None\nunsloth_lora_qkv: bool | None\nunsloth_lora_o: bool | None\nunsloth_rms_norm: bool | None\nunsloth_rope: bool | None\n\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_mlp_kernel: bool | None\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_qkv_kernel: bool | None\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_o_kernel: bool | None\n# Apply custom LoRA autograd function for embedding layers. See:\n# https://docs.axolotl.ai/docs/lora_optims.html\nlora_embedding_kernel: bool | None\n\n# Whether to use chunked cross entropy loss for memory efficiency\nchunked_cross_entropy: bool | None\n# Number of chunks to use for chunked cross entropy loss\nchunked_cross_entropy_num_chunks: int | None\n# Enable Entropy-Aware Focal Training loss (EAFT)\nuse_eaft: bool | None\n# Exponent for entropy weighting in EAFT (default: 1.0)\neaft_alpha: float | None = 1.0\n# Number of top logits for entropy approximation (default: 20)\neaft_k: int | None = 20\n\n# Whether to use ALST tiled mlp for memory efficient long context\ntiled_mlp: bool | None\n\n# Number of shards to use for ALST tiled mlp. If unset, it will be set based on\n# seqlen/hidden_size\ntiled_mlp_num_shards: int | None\n\n# Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on\n# llama.\ntiled_mlp_use_original_mlp: bool | None = True\n\nllama4_linearized_experts: bool | None\n\n# Deepspeed config path. e.g., deepspeed_configs/zero3.json\ndeepspeed: str | dict[str, Any] | None\n# Whether to use deepcompile for faster training with deepspeed\ndeepcompile: bool | None\n# FSDP configuration\nfsdp: list[str] | None\n\n# FSDP configuration options\nfsdp_config: FSDPConfig | None\n  # For FSDPConfig:\n  # FSDP version\n  fsdp_version: int | None\n  # Enable activation checkpointing to reduce memory usage during forward passes\n  activation_checkpointing: bool | None\n  # Offload parameters to CPU to reduce GPU memory usage\n  offload_params: bool | None\n  # Synchronize module states across all processes\n  sync_module_states: bool | None\n  # Enable CPU RAM efficient loading to reduce memory usage during model loading\n  cpu_ram_efficient_loading: bool | None\n  # Disabling this enables swap memory usage for resource-constrained setups when\n  # offload_params is enabled.\n  cpu_offload_pin_memory: bool | None\n  # Use original parameters instead of flattened parameters\n  use_orig_params: bool | None\n\n  # Type of state dict to use for saving/loading checkpoints\n  state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n  # Final state dict type to use after training completion\n  final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n\n  # Policy for automatically wrapping modules with FSDP\n  auto_wrap_policy: Literal['TRANSFORMER_BASED_WRAP', 'SIZE_BASED_WRAP'] | None\n  # Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')\n  transformer_layer_cls_to_wrap: str | None\n\n  # Reshard parameters after forward pass to save memory\n  reshard_after_forward: bool | None\n  # Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')\n  mixed_precision_policy: str | None\n\n# FSDP version\nfsdp_version: int | None\nfsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n\n# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for\n# no eval.\nval_set_size: float | None = 0.0\n\n# Number of devices to shard across. If not set, will use all available devices.\ndp_shard_size: int | None\n# Number of devices to replicate across.\ndp_replicate_size: int | None\n# Deprecated: use `context_parallel_size` instead\nsequence_parallel_degree: int | None\n# Set to a divisor of the number of GPUs available to split sequences into chunks of\n# equal size. Use in long context training to prevent OOM when sequences cannot fit into\n# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each\n# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized\n# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more\n# details.\ncontext_parallel_size: int | None\n# Optional; strides across the key dimension. Larger values use more memory but should\n# make training faster. Must evenly divide the number of KV heads in your model.\nheads_k_stride: int | None\n# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to\n# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing\n# case.\nring_attn_func: RingAttnFunc | None\n# Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP.\ntensor_parallel_size: int | None\n\n# Add or change special tokens. If you add tokens here, you don't need to add them to\n# the `tokens` list.\nspecial_tokens: SpecialTokensConfig | None\n  # For SpecialTokensConfig:\n  bos_token: str | None\n  eos_token: str | None\n  pad_token: str | None\n  unk_token: str | None\n  additional_special_tokens: list[str] | None\n\n# Add extra tokens to the tokenizer\ntokens: list[str] | None\n# Mapping token_id to new_token_string to override reserved added_tokens in the\n# tokenizer. Only works for tokens that are not part of the base vocab (aka are\n# added_tokens). Can be checked if they exist in tokenizer.json added_tokens.\nadded_tokens_overrides: dict[int, str] | None\n\n# Whether to use torch.compile and which backend to use. setting to `auto` will enable\n# torch compile when torch&gt;=2.6.0\ntorch_compile: Literal['auto'] | bool | None\n# Backend to use for torch.compile\ntorch_compile_backend: str | None\ntorch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None\n\n# Maximum number of iterations to train for. It precedes num_epochs which means that if\n# both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps =&gt;\n# `num_epochs: 2` and `max_steps: 100` will train for 100 steps\nmax_steps: int | None\n# Number of warmup steps. Cannot use with warmup_ratio\nwarmup_steps: int | None\n# Warmup ratio. Cannot use with warmup_steps\nwarmup_ratio: float | None\n# Leave empty to eval at each epoch, integer for every N steps. float for fraction of\n# total steps\neval_steps: int | float | None\n# Number of times per epoch to run evals, mutually exclusive with eval_steps\nevals_per_epoch: int | None\n# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer\n# from `eval_steps`\neval_strategy: str | None\n\n# Leave empty to save at each epoch, integer for every N steps. float for fraction of\n# total steps\nsave_steps: int | float | None\n# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps\nsaves_per_epoch: int | None\n# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better\n# result is achieved, leave empty to infer from `save_steps`\nsave_strategy: str | None\n# Checkpoints saved at a time\nsave_total_limit: int | None\n# Whether to checkpoint a model after the first step of training. Defaults to False.\nsave_first_step: bool | None\n\n# Logging frequency\nlogging_steps: int | None\n# Stop training after this many evaluation losses have increased in a row. https://huggi\n# ngface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppin\n# gCallback\nearly_stopping_patience: int | None\nload_best_model_at_end: bool | None = False\n# Save only the model weights, skipping the optimizer. Using this means you can't resume\n# from checkpoints.\nsave_only_model: bool | None = False\n# Use tensorboard for logging\nuse_tensorboard: bool | None\n# Enable the pytorch profiler to capture the first N steps of training to the\n# output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more\n# information. Snapshots can be visualized @ https://pytorch.org/memory_viz\nprofiler_steps: int | None\n# Which step to start the profiler at. Useful for only capturing a few steps mid-run.\nprofiler_steps_start: int | None = 0\n# bool of whether to report tokens per second at the end of training. This is not\n# supported with pre-training datasets.\ninclude_tokens_per_second: bool | None\n# bool of whether to report tokens per second per-gpu during training by measuring\n# throughput of non-padding tokens.\ninclude_tkps: bool | None = True\n# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to\n# add noise to embeddings. Currently only supported on Llama and Mistral\nneftune_noise_alpha: float | None\n\n# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to\n# `beta` in `ORPOConfig` due to trl mapping.\norpo_alpha: float | None\n# Target reward margin for the SimPO loss\nsimpo_gamma: float | None\n# Weight of the BC regularizer\ncpo_alpha: float | None\n\n# Factor for desirable loss term in KTO loss\nkto_desirable_weight: float | None\n# Factor for undesirable loss term in KTO loss\nkto_undesirable_weight: float | None\n# The beta parameter for the RL training\nrl_beta: float | None\n\n# Defines the max memory usage per gpu on the system. Passed through to transformers\n# when loading the model.\nmax_memory: dict[int | Literal['cpu', 'disk'], int | str] | None\n# Limit the memory for all available GPUs to this amount (if an integer, expressed in\n# gigabytes); default: unset\ngpu_memory_limit: int | str | None\n# Whether to use low_cpu_mem_usage\nlow_cpu_mem_usage: bool | None\n\n# The name of the chat template to use for training, following values are supported:\n# tokenizer_default: Uses the chat template that is available in the\n# tokenizer_config.json. If the chat template is not available in the tokenizer, it will\n# raise an error. This is the default value.\n# alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n# are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n# tokenizer_default_fallback_*: where * is the name of the chat template to fallback to.\n# E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not\n# available in the tokenizer. jinja: Uses a custom jinja template for the chat template.\n# The custom jinja template should be provided in the chat_template_jinja field. The\n# selected chat template will be saved to the tokenizer_config.json for easier\n# inferencing\nchat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None\n# Custom jinja template or path to jinja file for chat template. This will be only used\n# if chat_template is set to `jinja` or `null` (in which case chat_template is\n# automatically set to `jinja`). Default is null.\nchat_template_jinja: str | None\n# Additional kwargs to pass to the chat template. This is useful for customizing the\n# chat template. For example, you can pass `thinking=False` to add a generation prompt\n# to the chat template.\nchat_template_kwargs: dict[str, Any] | None\n# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the\n# boundaries between conversation turns. For example: ['/INST', '&lt;/s&gt;',\n# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is\n# useful for templates that use multiple delimiter tokens.\neot_tokens: list[str] | None\n# Changes the default system message. Currently only supports chatml.\ndefault_system_message: str | None\n\n# Token index or indices to adjust embedding weights to the mean of the other tokens.\n# This is useful when the model has untrained embeddings.\nfix_untrained_tokens: int | list[int] | None\n\nis_preprocess: bool | None\npreprocess_iterable: bool | None\n\n# Total number of tokens - internal use\ntotal_num_tokens: int | None\ntotal_supervised_tokens: int | None\n# You can set these packing optimizations AFTER starting a training at least once. The\n# trainer will provide recommended values for these values.\nsample_packing_eff_est: float | None\naxolotl_config_path: str | None\n\n# Internal use only - Used to identify which the model is based on\nis_falcon_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on\nis_llama_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on. Please note that if\n# you set this to true, `padding_side` will be set to 'left' by default\nis_mistral_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on\nis_qwen_derived_model: bool | None\n\n# Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available\n# plugins or doc below for more details.\n# https://docs.axolotl.ai/docs/custom_integrations.html\nplugins: list[str] | None\n# Enable sample generation during training for monitoring\ngenerate_samples: bool | None = False\n# Number of samples to generate at each interval\nnum_generation_samples: int | None = 3\n# Maximum new tokens to generate per sample\ngeneration_max_new_tokens: int | None = 50\n# Temperature for sample generation (0.0 = greedy)\ngeneration_temperature: float | None = 0.7\n# Nucleus sampling parameter for generation\ngeneration_top_p: float | None\n# Top-k sampling parameter for generation\ngeneration_top_k: int | None\n# Ratio of input to use as prompt (0.0-1.0)\ngeneration_prompt_ratio: float | None = 0.5\n# Whether to use sampling (vs greedy decoding)\ngeneration_do_sample: bool | None = True\n\n# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This\n# can also be a relative path to a model on disk\nbase_model: str (required)\n# If the base_model repo on hf hub doesn't include configuration .json files, You can\n# set that here, or leave this empty to default to base_model\nbase_model_config: str | None\n# transformers config class (e.g., 'LlamaConfig', 'MistralConfig'). Defaults to\n# AutoConfig.\ncls_model_config: str | None\n# Optional tokenizer configuration path in case you want to use a different tokenizer\n# than the one defined in the base model\ntokenizer_config: str | None\n# use_fast option for tokenizer loading from_pretrained, default to True\ntokenizer_use_fast: bool | None\n# Whether to use the legacy tokenizer setting, defaults to True\ntokenizer_legacy: bool | None\n# Whether to use mistral-common tokenizer. If set to True, it will use the mistral-\n# common tokenizer.\ntokenizer_use_mistral_common: bool | None\n# Corresponding tokenizer for the model AutoTokenizer is a good choice\ntokenizer_type: str | None\n# transformers processor class\nprocessor_type: str | None\n# Whether to save jinja files for tokenizer, transformers default is True\ntokenizer_save_jinja_files: bool | None = True\n# Trust remote code for untrusted source\ntrust_remote_code: bool | None\n\n# Don't move the model to the device before sharding. Set to `false` to revert to legacy\n# behavior.\nexperimental_skip_move_to_device: bool | None = True\n\n# Use custom kernels, e.g. MegaBlocks.\nuse_kernels: bool | None\n\n# Model loading quantization config\nmodel_quantization_config: Literal['Mxfp4Config'] | None\n# kwargs for model quantization config\nmodel_quantization_config_kwargs: dict[str, Any] | None\n\n# Where to save the full-finetuned model to\noutput_dir: str = ./model-out\n# push checkpoints to hub\nhub_model_id: str | None\n# how to push checkpoints to hub\nhub_strategy: str | None\n# branch/revision to push to on hub (default: main)\nhub_revision: str | None\n# Whether to save the model using safetensors format. Defaults to True.\nsave_safetensors: bool | None = True\n\n# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer\nload_in_8bit: bool | None = False\n# Use bitsandbytes 4 bit\nload_in_4bit: bool | None = False\n\n# If you want to use 'lora', 'qlora', or 'llama-adapter', or leave blank to train all\n# parameters in original model\nadapter: Literal['lora', 'qlora', 'llama-adapter'] | None\n# If you already have a lora model trained that you want to load, put that here. This\n# means after training, if you want to test the model, you should set this to the value\n# of `output_dir`. Note that if you merge an adapter to the base model, a new\n# subdirectory `merged` will be created under the `output_dir`.\nlora_model_dir: str | None\nlora_r: int | None\nlora_alpha: int | None\nlora_fan_in_fan_out: bool | None\nlora_target_modules: str | list[str] | None\nlora_target_parameters: str | list[str] | None\n# If true, will target all linear modules\nlora_target_linear: bool | None\n# If you added new tokens to the tokenizer, you may need to save some LoRA modules\n# because they need to know the new tokens. For LLaMA and Mistral, you need to save\n# `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts\n# tokens to embeddings, and `lm_head` converts embeddings to token probabilities.\nlora_modules_to_save: list[str] | None\nlora_dropout: float | None = 0.0\n# The layer indices to transform, otherwise, apply to all layers\npeft_layers_to_transform: list[int] | None\npeft_layers_pattern: list[str] | None\n\npeft: PeftConfig | None\n  # For PeftConfig:\n  # Configuration options for loftq initialization for LoRA\n  loftq_config: LoftQConfig | None\n    # For LoftQConfig:\n    # typically 4 bits\n    loftq_bits: int = 4\n\n# Whether to use DoRA.\npeft_use_dora: bool | None\n# Whether to use RSLoRA.\npeft_use_rslora: bool | None\n# List of layer indices to replicate.\npeft_layer_replication: list[tuple[int, int]] | None\n# How to initialize LoRA weights. Default to True which is MS original implementation.\npeft_init_lora_weights: bool | str | None\n# A list of token indices to fine-tune on the `embed_tokens` layer. Otherwise, a dict\n# mapping an embedding layer name to its trainable token indices. See\n# https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-\n# tokens-alongside-lora\npeft_trainable_token_indices: list[int] | dict[str, list[int]] | None\n# Whether to tie adapter weights for tied model weights. See\n# https://github.com/huggingface/peft/issues/2864\npeft_ensure_weight_tying: bool | None\n# Whether to upcast the LoRA adapter to fp32. This is enabled by default in PEFT.\npeft_autocast_adapter_dtype: bool | None\n\n# load qlora model in sharded format for FSDP using answer.ai technique.\nqlora_sharded_model_loading: bool | None = False\n# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it\n# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge\nlora_on_cpu: bool | None\n# Whether you are training a 4-bit GPTQ quantized model\ngptq: bool | None\n# optional overrides to the bnb 4bit quantization configuration\nbnb_config_kwargs: dict[str, Any] | None\n\n# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.\nloraplus_lr_ratio: float | None\n# loraplus learning rate for lora embedding layers. Default value is 1e-6.\nloraplus_lr_embedding: float | None = 1e-06\n\nmerge_lora: bool | None\n# Method to use for LoRA merging. 'memory_efficient' (default) processes shards\n# individually to reduce memory usage, 'legacy' loads the full model into memory.\nmerge_method: Literal['legacy', 'memory_efficient'] | None = memory_efficient\n\n# Whether to use ReLoRA. Use with jagged_restart_*steps options.\nrelora: bool | None\n# threshold for optimizer magnitude when pruning\nrelora_prune_ratio: float | None\n# True to perform lora weight merges on cpu during restarts, for modest gpu memory\n# savings\nrelora_cpu_offload: bool | None\n\n# how often to reset for jagged restarts\njagged_restart_steps: int | None\n# how many warmup steps to take after reset for jagged restarts\njagged_restart_warmup_steps: int | None\n# how many anneal steps to take before reset for jagged restarts\njagged_restart_anneal_steps: int | None\n\n# If greater than 1, backpropagation will be skipped and the gradients will be\n# accumulated for the given number of steps.\ngradient_accumulation_steps: int | None = 1\n# The number of samples to include in each batch. This is the number of samples sent to\n# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps\nmicro_batch_size: int | None = 1\n# Total batch size, we do not recommended setting this manually\nbatch_size: int | None\n# per gpu micro batch size for evals, defaults to value of micro_batch_size\neval_batch_size: int | None\n\n# whether to find batch size that fits in memory. Passed to underlying transformers\n# Trainer\nauto_find_batch_size: bool | None\n\n# Whether to mask out or include the human's prompt from the training labels\ntrain_on_inputs: bool | None = False\n# Group similarly sized data to minimize padding. May be slower to start, as it must\n# download and sort the entire dataset. Note that training loss may have an oscillating\n# pattern with this enabled.\ngroup_by_length: bool | None\n\nlearning_rate: str | float (required)\nembedding_lr: float | None\nembedding_lr_scale: float | None\n# Specify weight decay\nweight_decay: float | None = 0.0\n# Specify optimizer\noptimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED\n# Dictionary of arguments to pass to the optimizer\noptim_args: str | dict[str, Any] | None\n# The target modules to optimize, i.e. the module names that you would like to train,\n# right now this is used only for GaLore algorithm\noptim_target_modules: list[str] | Literal['all_linear'] | None\n# Path to torch distx for optim 'adamw_anyprecision'\ntorchdistx_path: str | None\nlr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE\n# Specify a scheduler and kwargs to use with the optimizer\nlr_scheduler_kwargs: dict[str, Any] | None\nlr_quadratic_warmup: bool | None\n# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of\n# peak lr\ncosine_min_lr_ratio: float | None\n# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means\n# start cosine_min_lr at 80% of training step\ncosine_constant_lr_ratio: float | None\n# Learning rate div factor\nlr_div_factor: float | None\n\nlr_groups: list[LrGroup] | None\n  # For LrGroup:\n  name: str (required)\n  modules: list[str] (required)\n  lr: float (required)\n\n# adamw hyperparams\nadam_epsilon: float | None\n# only used for CAME Optimizer\nadam_epsilon2: float | None\n# adamw hyperparams\nadam_beta1: float | None\n# adamw hyperparams\nadam_beta2: float | None\n# only used for CAME Optimizer\nadam_beta3: float | None\n\n# Dion Optimizer learning rate\ndion_lr: float | None\n# Dion Optimizer momentum\ndion_momentum: float | None\n# Dion Optimizer: r/d fraction for low-rank approximation. Used to compute the low-rank\n# dimension.\ndion_rank_fraction: float | None = 1.0\n# Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may\n# be useful to ensure even sharding.\ndion_rank_multiple_of: int | None = 1\n\n# Gradient clipping max norm\nmax_grad_norm: float | None\nnum_epochs: float = 1.0\n\nuse_wandb: bool | None\n# Set the name of your wandb run\nwandb_name: str | None\n# Set the ID of your wandb run\nwandb_run_id: str | None\n# \"offline\" to save run metadata locally and not sync to the server, \"disabled\" to turn\n# off wandb\nwandb_mode: str | None\n# Your wandb project name\nwandb_project: str | None\n# A wandb Team name if using a Team\nwandb_entity: str | None\nwandb_watch: str | None\n# \"checkpoint\" to log model to wandb Artifacts every `save_steps` or \"end\" to log only\n# at the end of training\nwandb_log_model: str | None\n\nuse_mlflow: bool | None\n# URI to mlflow\nmlflow_tracking_uri: str | None\n# Your experiment name\nmlflow_experiment_name: str | None\n# Your run name\nmlflow_run_name: str | None\n# set to true to copy each saved checkpoint on each save to mlflow artifact registry\nhf_mlflow_log_artifacts: bool | None\n\n# Enable or disable Comet integration.\nuse_comet: bool | None\n# API key for Comet. Recommended to set via `comet login`.\ncomet_api_key: str | None\n# Workspace name in Comet. Defaults to the user's default workspace.\ncomet_workspace: str | None\n# Project name in Comet. Defaults to Uncategorized.\ncomet_project_name: str | None\n# Identifier for the experiment. Used to append data to an existing experiment or\n# control the key of new experiments. Default to a random key.\ncomet_experiment_key: str | None\n# Create a new experiment (\"create\") or log to an existing one (\"get\"). Default\n# (\"get_or_create\") auto-selects based on configuration.\ncomet_mode: str | None\n# Set to True to log data to Comet server, or False for offline storage. Default is\n# True.\ncomet_online: bool | None\n# Dictionary for additional configuration settings, see the doc for more details.\ncomet_experiment_config: dict[str, Any] | None\n\nuse_trackio: bool | None\n# Your trackio project name\ntrackio_project_name: str | None\n# Set the name of your trackio run\ntrackio_run_name: str | None\n# Hugging Face Space ID to sync dashboard to (optional, runs locally if not provided)\ntrackio_space_id: str | None\n\n# Enable OpenTelemetry metrics collection and Prometheus export\nuse_otel_metrics: bool | None = False\n# Host to bind the OpenTelemetry metrics server to\notel_metrics_host: str | None = localhost\n# Port for the Prometheus metrics HTTP server\notel_metrics_port: int | None = 8000\n\n# the number of activate layers in LISA\nlisa_n_layers: int | None\n# how often to switch layers in LISA\nlisa_step_interval: int | None\n# path under the model to access the layers\nlisa_layers_attribute: str | None = model.layers\n\ngradio_title: str | None\ngradio_share: bool | None\ngradio_server_name: str | None\ngradio_server_port: int | None\ngradio_max_new_tokens: int | None\ngradio_temperature: float | None\n\nuse_ray: bool = False\nray_run_name: str | None\nray_num_workers: int = 1\nresources_per_worker: dict\n\n# The size of the image to resize to. It can be an integer (resized into padded-square\n# image) or a tuple (width, height).If not provided, we will attempt to load from\n# preprocessor.size, otherwise, images won't be resized.\nimage_size: int | tuple[int, int] | None\n# The resampling algorithm to use for image resizing. Default is bilinear. Please refer\n# to PIL.Image.Resampling for more details.\nimage_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None\n\n# optional overrides to the base model configuration\noverrides_of_model_config: dict[str, Any] | None\n# optional overrides the base model loading from_pretrained\noverrides_of_model_kwargs: dict[str, Any] | None\n# If you want to specify the type of model to load, AutoModelForCausalLM is a good\n# choice too\ntype_of_model: str | None\n# You can specify to choose a specific model revision from huggingface hub\nrevision_of_model: str | None\n\nmax_packed_sequence_len: int | None\nrope_scaling: Any | None\nnoisy_embedding_alpha: float | None\ndpo_beta: float | None\nevaluation_strategy: str | None\neval_table_size: int | None\neval_max_new_tokens: int | None\ndpo_use_logits_to_keep: bool | None\ndpo_generate_during_eval: bool | None\ndpo_norm_loss: bool | None\nrpo_alpha: float | None",
     "crumbs": [
       "Getting Started",
       "Config Reference"
diff --git a/sitemap.xml b/sitemap.xml
index 14ccda66f..9197cc564 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -2,942 +2,942 @@
 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
   <url>
     <loc>https://docs.axolotl.ai/FAQS.html</loc>
-    <lastmod>2026-03-31T23:17:02.880Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.703Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/dataset-formats/template_free.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/dataset-formats/conversation.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/dataset-formats/pretraining.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/dataset-formats/index.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.args.html</loc>
-    <lastmod>2026-03-31T23:20:15.639Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.788Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.orcamini.html</loc>
-    <lastmod>2026-03-31T23:20:16.136Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.267Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.preprocess.html</loc>
-    <lastmod>2026-03-31T23:20:15.731Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.876Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.collators.core.html</loc>
-    <lastmod>2026-03-31T23:20:16.993Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.097Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.llama3.html</loc>
-    <lastmod>2026-03-31T23:20:16.171Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.300Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.schemas.enums.html</loc>
-    <lastmod>2026-03-31T23:20:16.750Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.862Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.lora.html</loc>
-    <lastmod>2026-03-31T23:20:16.481Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.599Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/common.datasets.html</loc>
-    <lastmod>2026-03-31T23:20:16.990Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.094Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.relora.html</loc>
-    <lastmod>2026-03-31T23:20:16.349Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.472Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.builders.base.html</loc>
-    <lastmod>2026-03-31T23:20:15.476Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.624Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.input_output.html</loc>
-    <lastmod>2026-03-31T23:20:16.118Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.248Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/integrations.lm_eval.args.html</loc>
-    <lastmod>2026-03-31T23:20:16.963Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.068Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.inference.html</loc>
-    <lastmod>2026-03-31T23:20:15.696Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.843Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_disk.html</loc>
-    <lastmod>2026-03-31T23:20:16.464Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.583Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.datasets.chat.html</loc>
-    <lastmod>2026-03-31T23:20:15.545Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.696Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.chat.format.shared.html</loc>
-    <lastmod>2026-03-31T23:20:15.538Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.690Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/logging_config.html</loc>
-    <lastmod>2026-03-31T23:20:15.468Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.610Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.chat_template.html</loc>
-    <lastmod>2026-03-31T23:20:16.042Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.176Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.collators.mamba.html</loc>
-    <lastmod>2026-03-31T23:20:17.019Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.124Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.config.html</loc>
-    <lastmod>2026-03-31T23:20:15.672Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.821Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/loaders.model.html</loc>
-    <lastmod>2026-03-31T23:20:15.908Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.046Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.kto.chatml.html</loc>
-    <lastmod>2026-03-31T23:20:16.210Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.338Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.quantize.html</loc>
-    <lastmod>2026-03-31T23:20:15.738Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.882Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.bradley_terry.llama3.html</loc>
-    <lastmod>2026-03-31T23:20:16.243Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.370Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/integrations.spectrum.args.html</loc>
-    <lastmod>2026-03-31T23:20:16.968Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.072Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.messages.chat.html</loc>
-    <lastmod>2026-03-31T23:20:16.150Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.280Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.perplexity.html</loc>
-    <lastmod>2026-03-31T23:20:17.084Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.186Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.lora_kernels.html</loc>
-    <lastmod>2026-03-31T23:20:16.384Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.506Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.data.batch_dataset_fetcher.html</loc>
-    <lastmod>2026-03-31T23:20:16.425Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.546Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/loaders.patch_manager.html</loc>
-    <lastmod>2026-03-31T23:20:15.949Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.086Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.model_shard_quant.html</loc>
-    <lastmod>2026-03-31T23:20:16.487Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.606Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.schemas.multimodal.html</loc>
-    <lastmod>2026-03-31T23:20:16.719Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.832Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.profiler.html</loc>
-    <lastmod>2026-03-31T23:20:17.089Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.190Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/convert.html</loc>
-    <lastmod>2026-03-31T23:20:15.405Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.547Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.utils.html</loc>
-    <lastmod>2026-03-31T23:20:15.760Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.904Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/kernels.lora.html</loc>
-    <lastmod>2026-03-31T23:20:16.290Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.415Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.utils.html</loc>
-    <lastmod>2026-03-31T23:20:16.391Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.512Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/common.const.html</loc>
-    <lastmod>2026-03-31T23:20:16.971Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.076Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.freeze.html</loc>
-    <lastmod>2026-03-31T23:20:16.502Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.620Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.schemas.utils.html</loc>
-    <lastmod>2026-03-31T23:20:16.757Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.869Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.qat.html</loc>
-    <lastmod>2026-03-31T23:20:17.108Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.209Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.data.sft.html</loc>
-    <lastmod>2026-03-31T23:20:16.612Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.728Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_xformers.html</loc>
-    <lastmod>2026-03-31T23:20:16.341Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.464Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.trainers.grpo.sampler.html</loc>
-    <lastmod>2026-03-31T23:20:15.893Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.033Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.chat.messages.html</loc>
-    <lastmod>2026-03-31T23:20:15.533Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.685Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.trainers.mamba.html</loc>
-    <lastmod>2026-03-31T23:20:15.853Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.992Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.passthrough.html</loc>
-    <lastmod>2026-03-31T23:20:16.190Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.318Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/kernels.swiglu.html</loc>
-    <lastmod>2026-03-31T23:20:16.315Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.439Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.pygmalion.html</loc>
-    <lastmod>2026-03-31T23:20:16.145Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.275Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.schemas.peft.html</loc>
-    <lastmod>2026-03-31T23:20:16.708Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.822Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.schemas.trl.html</loc>
-    <lastmod>2026-03-31T23:20:16.713Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.826Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.completion.html</loc>
-    <lastmod>2026-03-31T23:20:16.110Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.241Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.vllm_serve.html</loc>
-    <lastmod>2026-03-31T23:20:15.746Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.890Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.trainer.html</loc>
-    <lastmod>2026-03-31T23:20:16.523Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.641Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.ctx_managers.sequence_parallel.html</loc>
-    <lastmod>2026-03-31T23:20:15.999Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.134Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.training_args.html</loc>
-    <lastmod>2026-03-31T23:20:15.504Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.656Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/evaluate.html</loc>
-    <lastmod>2026-03-31T23:20:15.378Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.523Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.comet_.html</loc>
-    <lastmod>2026-03-31T23:20:17.099Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.201Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/loaders.tokenizer.html</loc>
-    <lastmod>2026-03-31T23:20:15.919Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.056Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_flash.html</loc>
-    <lastmod>2026-03-31T23:20:16.339Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.462Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.cloud.modal_.html</loc>
-    <lastmod>2026-03-31T23:20:15.758Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.902Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.stepwise_supervised.html</loc>
-    <lastmod>2026-03-31T23:20:16.123Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.254Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.btlm_attn_hijack_flash.html</loc>
-    <lastmod>2026-03-31T23:20:16.393Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.514Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.chat.format.llama3x.html</loc>
-    <lastmod>2026-03-31T23:20:15.536Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.688Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.quantization.html</loc>
-    <lastmod>2026-03-31T23:20:16.637Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.752Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.unsloth_.html</loc>
-    <lastmod>2026-03-31T23:20:16.414Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.535Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.orpo.chat_template.html</loc>
-    <lastmod>2026-03-31T23:20:16.238Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.365Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.art.html</loc>
-    <lastmod>2026-03-31T23:20:15.644Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.792Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/loaders.processor.html</loc>
-    <lastmod>2026-03-31T23:20:15.921Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.058Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.merge_sharded_fsdp_weights.html</loc>
-    <lastmod>2026-03-31T23:20:15.721Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.866Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/kernels.quantize.html</loc>
-    <lastmod>2026-03-31T23:20:16.330Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.453Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.trainers.utils.html</loc>
-    <lastmod>2026-03-31T23:20:15.896Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.035Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chat_template.html</loc>
-    <lastmod>2026-03-31T23:20:16.158Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.288Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.delinearize_llama4.html</loc>
-    <lastmod>2026-03-31T23:20:15.678Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.826Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/faq.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/expert_quantization.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/optimizations.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.709Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/multi-gpu.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.708Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/nd_parallelism.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.709Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/mac.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.708Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/reward_modelling.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.709Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/ministral3.html</loc>
-    <lastmod>2026-03-31T23:20:39.829Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.247Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/hunyuan.html</loc>
-    <lastmod>2026-03-31T23:20:39.838Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.256Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/smolvlm2.html</loc>
-    <lastmod>2026-03-31T23:20:39.836Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.255Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/ministral3/vision.html</loc>
-    <lastmod>2026-03-31T23:20:39.830Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.248Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/voxtral.html</loc>
-    <lastmod>2026-03-31T23:20:39.832Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.250Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/ministral.html</loc>
-    <lastmod>2026-03-31T23:20:39.831Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.250Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/granite4.html</loc>
-    <lastmod>2026-03-31T23:20:39.837Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.255Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/phi.html</loc>
-    <lastmod>2026-03-31T23:20:39.836Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.255Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/internvl3_5.html</loc>
-    <lastmod>2026-03-31T23:20:39.827Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.245Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/magistral/think.html</loc>
-    <lastmod>2026-03-31T23:20:39.831Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.249Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/mistral-small.html</loc>
-    <lastmod>2026-03-31T23:20:39.832Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.250Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/gemma3n.html</loc>
-    <lastmod>2026-03-31T23:20:39.835Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.253Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/arcee.html</loc>
-    <lastmod>2026-03-31T23:20:39.828Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.246Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/llama-2.html</loc>
-    <lastmod>2026-03-31T23:20:39.833Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.252Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/llama-4.html</loc>
-    <lastmod>2026-03-31T23:20:39.833Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.252Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/seed-oss.html</loc>
-    <lastmod>2026-03-31T23:20:39.836Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.255Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/jamba.html</loc>
-    <lastmod>2026-03-31T23:20:39.838Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.257Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/nccl.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.709Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/debugging.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/dataset_preprocessing.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/ray-integration.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.709Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/custom_integrations.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/torchao.html</loc>
-    <lastmod>2026-03-31T23:17:02.886Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.710Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/lr_groups.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.708Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/streaming.html</loc>
-    <lastmod>2026-03-31T23:17:02.886Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.710Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/amd_hpc.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/installation.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.708Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/inference.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.708Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/getting-started.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/telemetry.html</loc>
-    <lastmod>2026-03-31T23:17:02.886Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.710Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html</loc>
-    <lastmod>2026-03-31T23:17:02.917Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.738Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/index.html</loc>
-    <lastmod>2026-03-31T23:17:02.907Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.731Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/examples/colab-notebooks/colab-axolotl-example.html</loc>
-    <lastmod>2026-03-31T23:17:02.890Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.714Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/src/axolotl/integrations/LICENSE.html</loc>
-    <lastmod>2026-03-31T23:17:02.916Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.737Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/batch_vs_grad.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/sequence_parallelism.html</loc>
-    <lastmod>2026-03-31T23:17:02.886Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.710Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/quantize.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.709Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/docker.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/attention.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/unsloth.html</loc>
-    <lastmod>2026-03-31T23:17:02.886Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.710Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/qat.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.709Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/multi-node.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.708Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/optimizers.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.709Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/config-reference.html</loc>
-    <lastmod>2026-03-31T23:20:38.941Z</lastmod>
+    <lastmod>2026-04-01T17:33:51.225Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/gradient_checkpointing.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/multipack.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.709Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/LiquidAI.html</loc>
-    <lastmod>2026-03-31T23:20:39.837Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.256Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/magistral.html</loc>
-    <lastmod>2026-03-31T23:20:39.830Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.249Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/devstral.html</loc>
-    <lastmod>2026-03-31T23:20:39.833Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.251Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/qwen3-next.html</loc>
-    <lastmod>2026-03-31T23:20:39.834Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.253Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/mistral.html</loc>
-    <lastmod>2026-03-31T23:20:39.833Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.251Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/plano.html</loc>
-    <lastmod>2026-03-31T23:20:39.826Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.243Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/olmo3.html</loc>
-    <lastmod>2026-03-31T23:20:39.827Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.245Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/magistral/vision.html</loc>
-    <lastmod>2026-03-31T23:20:39.831Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.249Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/mimo.html</loc>
-    <lastmod>2026-03-31T23:20:39.827Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.245Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/index.html</loc>
-    <lastmod>2026-03-31T23:20:39.839Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.257Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/trinity.html</loc>
-    <lastmod>2026-03-31T23:20:39.828Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.246Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/kimi-linear.html</loc>
-    <lastmod>2026-03-31T23:20:39.826Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.243Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/orpheus.html</loc>
-    <lastmod>2026-03-31T23:20:39.838Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.257Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/qwen3.html</loc>
-    <lastmod>2026-03-31T23:20:39.834Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.253Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/ministral3/think.html</loc>
-    <lastmod>2026-03-31T23:20:39.829Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.247Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/apertus.html</loc>
-    <lastmod>2026-03-31T23:20:39.835Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.254Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/gpt-oss.html</loc>
-    <lastmod>2026-03-31T23:20:39.835Z</lastmod>
+    <lastmod>2026-04-01T17:33:52.254Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/mixed_precision.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.708Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/lora_optims.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.708Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/dataset_loading.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/input_output.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.708Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/fsdp_qlora.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/checkpoint_saving.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/cli.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.mlflow_.html</loc>
-    <lastmod>2026-03-31T23:20:17.095Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.197Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/models.mamba.modeling_mamba.html</loc>
-    <lastmod>2026-03-31T23:20:16.991Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.095Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.trainers.dpo.trainer.html</loc>
-    <lastmod>2026-03-31T23:20:15.862Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.001Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.utils.fetch.html</loc>
-    <lastmod>2026-03-31T23:20:15.781Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.924Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.builders.causal.html</loc>
-    <lastmod>2026-03-31T23:20:15.482Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.635Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.builders.rl.html</loc>
-    <lastmod>2026-03-31T23:20:15.488Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.642Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.bench.html</loc>
-    <lastmod>2026-03-31T23:20:16.492Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.611Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.kto.user_defined.html</loc>
-    <lastmod>2026-03-31T23:20:16.212Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.340Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_instruct.html</loc>
-    <lastmod>2026-03-31T23:20:16.062Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.195Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_chat.html</loc>
-    <lastmod>2026-03-31T23:20:16.060Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.193Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.collators.mm_chat.html</loc>
-    <lastmod>2026-03-31T23:20:17.026Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.130Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.schedulers.html</loc>
-    <lastmod>2026-03-31T23:20:16.562Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.679Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/kernels.utils.html</loc>
-    <lastmod>2026-03-31T23:20:16.332Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.455Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.chat.format.chatml.html</loc>
-    <lastmod>2026-03-31T23:20:15.535Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.686Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/loaders.constants.html</loc>
-    <lastmod>2026-03-31T23:20:15.951Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.088Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.schemas.model.html</loc>
-    <lastmod>2026-03-31T23:20:16.664Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.779Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/integrations.grokfast.optimizer.html</loc>
-    <lastmod>2026-03-31T23:20:16.946Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.051Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.utils.load.html</loc>
-    <lastmod>2026-03-31T23:20:15.788Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.930Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/loaders.adapter.html</loc>
-    <lastmod>2026-03-31T23:20:15.927Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.065Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.train.html</loc>
-    <lastmod>2026-03-31T23:20:15.605Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.755Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.stablelm_attn_hijack_flash.html</loc>
-    <lastmod>2026-03-31T23:20:16.400Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.521Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.checks.html</loc>
-    <lastmod>2026-03-31T23:20:15.651Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.800Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.user_defined.html</loc>
-    <lastmod>2026-03-31T23:20:16.188Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.317Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.llama2_chat.html</loc>
-    <lastmod>2026-03-31T23:20:16.103Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.234Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.trainers.trl.html</loc>
-    <lastmod>2026-03-31T23:20:15.846Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.986Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.mistral_attn_hijack_flash.html</loc>
-    <lastmod>2026-03-31T23:20:16.343Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.466Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.trainers.mixins.scheduler.html</loc>
-    <lastmod>2026-03-31T23:20:15.970Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.106Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.trainers.grpo.trainer.html</loc>
-    <lastmod>2026-03-31T23:20:15.879Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.019Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.merge_lora.html</loc>
-    <lastmod>2026-03-31T23:20:15.707Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.853Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/datasets.html</loc>
-    <lastmod>2026-03-31T23:20:15.385Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.531Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.schemas.training.html</loc>
-    <lastmod>2026-03-31T23:20:16.673Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.788Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.distributed.html</loc>
-    <lastmod>2026-03-31T23:20:16.587Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.703Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.cloud.base.html</loc>
-    <lastmod>2026-03-31T23:20:15.750Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.894Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/kernels.geglu.html</loc>
-    <lastmod>2026-03-31T23:20:16.303Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.427Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.trainers.mixins.optimizer.html</loc>
-    <lastmod>2026-03-31T23:20:15.958Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.094Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/index.html</loc>
-    <lastmod>2026-03-31T23:20:15.288Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.438Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.base.html</loc>
-    <lastmod>2026-03-31T23:20:16.001Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.136Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.evaluate.html</loc>
-    <lastmod>2026-03-31T23:20:15.615Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.764Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/train.html</loc>
-    <lastmod>2026-03-31T23:20:15.365Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.511Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/common.architectures.html</loc>
-    <lastmod>2026-03-31T23:20:16.970Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.074Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.kto.llama3.html</loc>
-    <lastmod>2026-03-31T23:20:16.200Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.328Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.lisa.html</loc>
-    <lastmod>2026-03-31T23:20:17.091Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.192Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.utils.train.html</loc>
-    <lastmod>2026-03-31T23:20:15.810Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.951Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/integrations.liger.args.html</loc>
-    <lastmod>2026-03-31T23:20:16.959Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.064Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_tokenizers.html</loc>
-    <lastmod>2026-03-31T23:20:15.457Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.597Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.utils.sweeps.html</loc>
-    <lastmod>2026-03-31T23:20:15.795Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.937Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.utils.args.html</loc>
-    <lastmod>2026-03-31T23:20:15.775Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.917Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.chat_templates.html</loc>
-    <lastmod>2026-03-31T23:20:16.474Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.594Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.schemas.config.html</loc>
-    <lastmod>2026-03-31T23:20:16.656Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.771Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.user_defined.html</loc>
-    <lastmod>2026-03-31T23:20:16.087Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.219Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.schemas.datasets.html</loc>
-    <lastmod>2026-03-31T23:20:16.698Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.811Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/integrations.base.html</loc>
-    <lastmod>2026-03-31T23:20:16.939Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.045Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.tokenization.html</loc>
-    <lastmod>2026-03-31T23:20:16.473Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.592Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.multipack.html</loc>
-    <lastmod>2026-03-31T23:20:16.344Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.468Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/integrations.kd.trainer.html</loc>
-    <lastmod>2026-03-31T23:20:16.955Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.060Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.mixtral.html</loc>
-    <lastmod>2026-03-31T23:20:16.427Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.548Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.trainers.base.html</loc>
-    <lastmod>2026-03-31T23:20:15.828Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.969Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.schemas.integrations.html</loc>
-    <lastmod>2026-03-31T23:20:16.739Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.851Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.trainers.mixins.rng_state_loader.html</loc>
-    <lastmod>2026-03-31T23:20:15.962Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.098Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.main.html</loc>
-    <lastmod>2026-03-31T23:20:15.595Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.745Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.trainer_fsdp_optim.html</loc>
-    <lastmod>2026-03-31T23:20:16.404Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.525Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.datasets.transforms.chat_builder.html</loc>
-    <lastmod>2026-03-31T23:20:15.554Z</lastmod>
+    <lastmod>2026-04-01T17:33:27.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_w_system.html</loc>
-    <lastmod>2026-03-31T23:20:16.077Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.209Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/integrations.cut_cross_entropy.args.html</loc>
-    <lastmod>2026-03-31T23:20:16.944Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.050Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.transformers_fa_utils.html</loc>
-    <lastmod>2026-03-31T23:20:16.412Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.533Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.data.streaming.html</loc>
-    <lastmod>2026-03-31T23:20:16.605Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.721Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.collators.batching.html</loc>
-    <lastmod>2026-03-31T23:20:17.016Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.119Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.samplers.multipack.html</loc>
-    <lastmod>2026-03-31T23:20:17.076Z</lastmod>
+    <lastmod>2026-04-01T17:33:29.178Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chatml.html</loc>
-    <lastmod>2026-03-31T23:20:16.184Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.313Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.dict.html</loc>
-    <lastmod>2026-03-31T23:20:16.593Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.710Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.zephyr.html</loc>
-    <lastmod>2026-03-31T23:20:16.186Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.315Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.optimizers.adopt.html</loc>
-    <lastmod>2026-03-31T23:20:16.603Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.719Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.metharme.html</loc>
-    <lastmod>2026-03-31T23:20:16.132Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.262Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html</loc>
-    <lastmod>2026-03-31T23:20:16.431Z</lastmod>
+    <lastmod>2026-04-01T17:33:28.552Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/rlhf.html</loc>
-    <lastmod>2026-03-31T23:17:02.885Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.709Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/dataset-formats/inst_tune.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/dataset-formats/stepwise_supervised.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/dataset-formats/tokenized.html</loc>
-    <lastmod>2026-03-31T23:17:02.882Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.705Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/multimodal.html</loc>
-    <lastmod>2026-03-31T23:17:02.884Z</lastmod>
+    <lastmod>2026-04-01T17:30:09.709Z</lastmod>
   </url>
 </urlset>