From f18c2bb1f8a1d1563e00da0d2253f3c45aee60a2 Mon Sep 17 00:00:00 2001
From: Quarto GHA Workflow Runner <quarto-github-actions-publish@example.com>
Date: Tue, 21 Apr 2026 14:23:11 +0000
Subject: [PATCH] Built site for gh-pages

---
 .nojekyll                                     |    2 +-
 FAQS.html                                     |    6 -
 docs/agents/grpo.html                         |    6 -
 docs/agents/model_architectures.html          |    6 -
 docs/agents/new_model_support.html            |    6 -
 docs/agents/preference_tuning.html            |    6 -
 docs/agents/pretraining.html                  |    6 -
 docs/agents/reward_modelling.html             |    6 -
 docs/agents/sft.html                          |    6 -
 docs/amd_hpc.html                             |    6 -
 docs/api/cli.args.html                        |    6 -
 docs/api/cli.art.html                         |    6 -
 docs/api/cli.checks.html                      |    6 -
 docs/api/cli.cloud.base.html                  |    6 -
 docs/api/cli.cloud.modal_.html                |    6 -
 docs/api/cli.config.html                      |    6 -
 docs/api/cli.delinearize_llama4.html          |    6 -
 docs/api/cli.evaluate.html                    |    6 -
 docs/api/cli.inference.html                   |    6 -
 docs/api/cli.main.html                        |    6 -
 docs/api/cli.merge_lora.html                  |    6 -
 docs/api/cli.merge_sharded_fsdp_weights.html  |    6 -
 docs/api/cli.preprocess.html                  |    6 -
 docs/api/cli.quantize.html                    |    6 -
 docs/api/cli.train.html                       |    6 -
 docs/api/cli.utils.args.html                  |    6 -
 docs/api/cli.utils.fetch.html                 |    6 -
 docs/api/cli.utils.html                       |    6 -
 docs/api/cli.utils.load.html                  |    6 -
 docs/api/cli.utils.sweeps.html                |    6 -
 docs/api/cli.utils.train.html                 |    6 -
 docs/api/cli.vllm_serve.html                  |    6 -
 docs/api/common.architectures.html            |    6 -
 docs/api/common.const.html                    |    6 -
 docs/api/common.datasets.html                 |    6 -
 docs/api/convert.html                         |    6 -
 docs/api/core.builders.base.html              |    6 -
 docs/api/core.builders.causal.html            |    6 -
 docs/api/core.builders.rl.html                |    6 -
 docs/api/core.chat.format.chatml.html         |    6 -
 docs/api/core.chat.format.llama3x.html        |    6 -
 docs/api/core.chat.format.shared.html         |    6 -
 docs/api/core.chat.messages.html              |    6 -
 docs/api/core.datasets.chat.html              |    6 -
 ...core.datasets.transforms.chat_builder.html |    6 -
 docs/api/core.trainers.base.html              |    6 -
 docs/api/core.trainers.dpo.trainer.html       |    6 -
 docs/api/core.trainers.grpo.sampler.html      |    6 -
 docs/api/core.trainers.grpo.trainer.html      |    6 -
 docs/api/core.trainers.mamba.html             |    6 -
 docs/api/core.trainers.mixins.optimizer.html  |    6 -
 ...core.trainers.mixins.rng_state_loader.html |    6 -
 docs/api/core.trainers.mixins.scheduler.html  |    6 -
 docs/api/core.trainers.trl.html               |    6 -
 docs/api/core.trainers.utils.html             |    6 -
 docs/api/core.training_args.html              |    6 -
 docs/api/datasets.html                        |    6 -
 docs/api/evaluate.html                        |    6 -
 docs/api/index.html                           |   16 +-
 docs/api/integrations.base.html               |    6 -
 .../integrations.cut_cross_entropy.args.html  |    6 -
 docs/api/integrations.grokfast.optimizer.html |    6 -
 docs/api/integrations.kd.trainer.html         |    6 -
 docs/api/integrations.liger.args.html         |    6 -
 docs/api/integrations.lm_eval.args.html       |    6 -
 docs/api/integrations.spectrum.args.html      |    6 -
 docs/api/kernels.geglu.html                   |    6 -
 docs/api/kernels.lora.html                    |    6 -
 docs/api/kernels.quantize.html                |    6 -
 docs/api/kernels.swiglu.html                  |    6 -
 docs/api/kernels.utils.html                   |    6 -
 docs/api/loaders.adapter.html                 |    6 -
 docs/api/loaders.constants.html               |    6 -
 docs/api/loaders.model.html                   |    6 -
 docs/api/loaders.patch_manager.html           |    6 -
 docs/api/loaders.processor.html               |    6 -
 docs/api/loaders.tokenizer.html               |    6 -
 docs/api/logging_config.html                  |    6 -
 docs/api/models.mamba.modeling_mamba.html     |    6 -
 .../monkeypatch.btlm_attn_hijack_flash.html   |    6 -
 ...onkeypatch.data.batch_dataset_fetcher.html |    6 -
 ...ch.gradient_checkpointing.offload_cpu.html |    6 -
 ...h.gradient_checkpointing.offload_disk.html |    6 -
 .../monkeypatch.llama_attn_hijack_flash.html  |    6 -
 ...onkeypatch.llama_attn_hijack_xformers.html |    6 -
 docs/api/monkeypatch.lora_kernels.html        |    6 -
 ...monkeypatch.mistral_attn_hijack_flash.html |    6 -
 docs/api/monkeypatch.mixtral.html             |    6 -
 docs/api/monkeypatch.multipack.html           |    6 -
 docs/api/monkeypatch.relora.html              |    6 -
 ...onkeypatch.stablelm_attn_hijack_flash.html |    6 -
 docs/api/monkeypatch.trainer_fsdp_optim.html  |    6 -
 .../monkeypatch.transformers_fa_utils.html    |    6 -
 docs/api/monkeypatch.unsloth_.html            | 1178 ---------
 docs/api/monkeypatch.utils.html               |    6 -
 docs/api/prompt_strategies.alpaca_chat.html   |    6 -
 .../prompt_strategies.alpaca_instruct.html    |    6 -
 .../prompt_strategies.alpaca_w_system.html    |    6 -
 docs/api/prompt_strategies.base.html          |    6 -
 ...rompt_strategies.bradley_terry.llama3.html |    6 -
 docs/api/prompt_strategies.chat_template.html |    6 -
 docs/api/prompt_strategies.completion.html    |    6 -
 .../prompt_strategies.dpo.chat_template.html  |    6 -
 docs/api/prompt_strategies.dpo.chatml.html    |    6 -
 docs/api/prompt_strategies.dpo.llama3.html    |    6 -
 .../prompt_strategies.dpo.passthrough.html    |    6 -
 .../prompt_strategies.dpo.user_defined.html   |    6 -
 docs/api/prompt_strategies.dpo.zephyr.html    |    6 -
 docs/api/prompt_strategies.input_output.html  |    6 -
 docs/api/prompt_strategies.kto.chatml.html    |    6 -
 docs/api/prompt_strategies.kto.llama3.html    |    6 -
 .../prompt_strategies.kto.user_defined.html   |    6 -
 docs/api/prompt_strategies.llama2_chat.html   |    6 -
 docs/api/prompt_strategies.messages.chat.html |    6 -
 docs/api/prompt_strategies.metharme.html      |    6 -
 docs/api/prompt_strategies.orcamini.html      |    6 -
 .../prompt_strategies.orpo.chat_template.html |    6 -
 docs/api/prompt_strategies.pygmalion.html     |    6 -
 ...prompt_strategies.stepwise_supervised.html |    6 -
 docs/api/prompt_strategies.user_defined.html  |    6 -
 docs/api/prompt_tokenizers.html               |    6 -
 docs/api/train.html                           |    6 -
 docs/api/utils.bench.html                     |    6 -
 docs/api/utils.callbacks.comet_.html          |    6 -
 docs/api/utils.callbacks.lisa.html            |    6 -
 docs/api/utils.callbacks.mlflow_.html         |    6 -
 docs/api/utils.callbacks.perplexity.html      |    6 -
 docs/api/utils.callbacks.profiler.html        |    6 -
 docs/api/utils.callbacks.qat.html             |    6 -
 docs/api/utils.chat_templates.html            |    6 -
 docs/api/utils.collators.batching.html        |    6 -
 docs/api/utils.collators.core.html            |    6 -
 docs/api/utils.collators.mamba.html           |    6 -
 docs/api/utils.collators.mm_chat.html         |    6 -
 .../utils.ctx_managers.sequence_parallel.html |    6 -
 docs/api/utils.data.sft.html                  |    6 -
 docs/api/utils.data.streaming.html            |    6 -
 docs/api/utils.dict.html                      |    6 -
 docs/api/utils.distributed.html               |    6 -
 docs/api/utils.freeze.html                    |    6 -
 docs/api/utils.lora.html                      |    6 -
 docs/api/utils.model_shard_quant.html         |    6 -
 docs/api/utils.optimizers.adopt.html          |    6 -
 docs/api/utils.quantization.html              |    6 -
 docs/api/utils.samplers.multipack.html        |    6 -
 docs/api/utils.schedulers.html                |    6 -
 docs/api/utils.schemas.config.html            |    6 -
 docs/api/utils.schemas.datasets.html          |    6 -
 docs/api/utils.schemas.enums.html             |    6 -
 docs/api/utils.schemas.integrations.html      |    6 -
 docs/api/utils.schemas.model.html             |    6 -
 docs/api/utils.schemas.multimodal.html        |    6 -
 docs/api/utils.schemas.peft.html              |    6 -
 docs/api/utils.schemas.training.html          |    6 -
 docs/api/utils.schemas.trl.html               |    6 -
 docs/api/utils.schemas.utils.html             |    6 -
 docs/api/utils.tokenization.html              |    6 -
 docs/api/utils.trainer.html                   |    6 -
 docs/attention.html                           |    6 -
 docs/batch_vs_grad.html                       |    6 -
 docs/checkpoint_saving.html                   |    6 -
 docs/choosing_method.html                     |    6 -
 docs/cli.html                                 |    6 -
 docs/config-reference.html                    | 1235 +++++-----
 docs/custom_integrations.html                 |    6 -
 docs/dataset-formats/conversation.html        |    6 -
 docs/dataset-formats/index.html               |    6 -
 docs/dataset-formats/inst_tune.html           |    6 -
 docs/dataset-formats/pretraining.html         |    6 -
 docs/dataset-formats/stepwise_supervised.html |    6 -
 docs/dataset-formats/template_free.html       |    6 -
 docs/dataset-formats/tokenized.html           |    6 -
 docs/dataset_loading.html                     |    6 -
 docs/dataset_preprocessing.html               |    6 -
 docs/debugging.html                           |   19 +-
 docs/docker.html                              |  133 +-
 docs/ebft.html                                |    6 -
 docs/expert_quantization.html                 |    6 -
 docs/faq.html                                 |    6 -
 docs/fsdp_qlora.html                          |    6 -
 docs/getting-started.html                     |    6 -
 docs/gradient_checkpointing.html              |    6 -
 docs/grpo.html                                |    6 -
 docs/inference.html                           |    6 -
 docs/input_output.html                        |    6 -
 docs/installation.html                        |  179 +-
 docs/lora_optims.html                         |    6 -
 docs/lr_groups.html                           |    6 -
 docs/mac.html                                 |    6 -
 docs/mixed_precision.html                     |    6 -
 docs/models/LiquidAI.html                     |   13 +-
 docs/models/apertus.html                      |   21 +-
 docs/models/arcee.html                        |   15 +-
 docs/models/devstral.html                     |    9 +-
 docs/models/gemma3n.html                      |   13 +-
 docs/models/gpt-oss.html                      |   11 +-
 docs/models/granite4.html                     |   15 +-
 docs/models/hunyuan.html                      |   15 +-
 docs/models/index.html                        |    6 -
 docs/models/internvl3_5.html                  |    8 +-
 docs/models/jamba.html                        |    6 -
 docs/models/kimi-linear.html                  |    6 -
 docs/models/llama-2.html                      |    6 -
 docs/models/llama-4.html                      |    6 -
 docs/models/magistral.html                    |    9 +-
 docs/models/magistral/think.html              |    6 -
 docs/models/magistral/vision.html             |    8 +-
 docs/models/mimo.html                         |    6 -
 docs/models/ministral.html                    |    6 -
 docs/models/ministral3.html                   |    8 +-
 docs/models/ministral3/think.html             |    6 -
 docs/models/ministral3/vision.html            |    8 +-
 docs/models/mistral-small.html                |    8 +-
 docs/models/mistral.html                      |    6 -
 docs/models/olmo3.html                        |    6 -
 docs/models/orpheus.html                      |    6 -
 docs/models/phi.html                          |    6 -
 docs/models/plano.html                        |    6 -
 docs/models/qwen3-next.html                   |    8 +-
 docs/models/qwen3.html                        |    6 -
 docs/models/seed-oss.html                     |   15 +-
 docs/models/smolvlm2.html                     |   11 +-
 docs/models/trinity.html                      |    6 -
 docs/models/voxtral.html                      |   13 +-
 docs/multi-gpu.html                           |    6 -
 docs/multi-node.html                          |    6 -
 docs/multimodal.html                          |    6 -
 docs/multipack.html                           |    6 -
 docs/nccl.html                                |    6 -
 docs/nd_parallelism.html                      |    6 -
 docs/optimizations.html                       |    6 -
 docs/optimizers.html                          |    6 -
 docs/qat.html                                 |    6 -
 docs/quantize.html                            |    6 -
 docs/ray-integration.html                     |    6 -
 docs/reward_modelling.html                    |    6 -
 docs/rlhf.html                                |    6 -
 docs/sequence_parallelism.html                |    6 -
 docs/streaming.html                           |    6 -
 docs/telemetry.html                           |    6 -
 docs/torchao.html                             |    6 -
 docs/training_stability.html                  |    6 -
 docs/unsloth.html                             | 1281 ----------
 docs/vllm_serving.html                        |    6 -
 .../colab-axolotl-example.html                |    6 -
 index.html                                    |  115 +-
 search.json                                   | 2151 ++++++++---------
 sitemap.xml                                   | 1424 ++++++-----
 src/axolotl/integrations/LICENSE.html         |    6 -
 .../cut_cross_entropy/ACKNOWLEDGEMENTS.html   |    6 -
 250 files changed, 2674 insertions(+), 6593 deletions(-)
 delete mode 100644 docs/api/monkeypatch.unsloth_.html
 delete mode 100644 docs/unsloth.html

diff --git a/.nojekyll b/.nojekyll
index 5361f5b47..d30a5b91b 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-11c88725
\ No newline at end of file
+1cf0992e
\ No newline at end of file
diff --git a/FAQS.html b/FAQS.html
index 85ac56908..455a991fb 100644
--- a/FAQS.html
+++ b/FAQS.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="./docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="./docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/agents/grpo.html b/docs/agents/grpo.html
index 2681152f4..9b26a4231 100644
--- a/docs/agents/grpo.html
+++ b/docs/agents/grpo.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/agents/model_architectures.html b/docs/agents/model_architectures.html
index b0373a116..241216422 100644
--- a/docs/agents/model_architectures.html
+++ b/docs/agents/model_architectures.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/agents/new_model_support.html b/docs/agents/new_model_support.html
index 671db5ade..0e969ab01 100644
--- a/docs/agents/new_model_support.html
+++ b/docs/agents/new_model_support.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/agents/preference_tuning.html b/docs/agents/preference_tuning.html
index 0c3381039..033910dbd 100644
--- a/docs/agents/preference_tuning.html
+++ b/docs/agents/preference_tuning.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/agents/pretraining.html b/docs/agents/pretraining.html
index fe2caf9aa..a07fce09e 100644
--- a/docs/agents/pretraining.html
+++ b/docs/agents/pretraining.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/agents/reward_modelling.html b/docs/agents/reward_modelling.html
index 1504a6349..d4babe887 100644
--- a/docs/agents/reward_modelling.html
+++ b/docs/agents/reward_modelling.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/agents/sft.html b/docs/agents/sft.html
index 4a6080cc2..9f7d18389 100644
--- a/docs/agents/sft.html
+++ b/docs/agents/sft.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/amd_hpc.html b/docs/amd_hpc.html
index 83d66b3b2..aafc33d48 100644
--- a/docs/amd_hpc.html
+++ b/docs/amd_hpc.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.args.html b/docs/api/cli.args.html
index 360de7c1e..ce24a26de 100644
--- a/docs/api/cli.args.html
+++ b/docs/api/cli.args.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.art.html b/docs/api/cli.art.html
index 9790aef65..1996b1b60 100644
--- a/docs/api/cli.art.html
+++ b/docs/api/cli.art.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.checks.html b/docs/api/cli.checks.html
index 9d2277f1b..d199c5ea0 100644
--- a/docs/api/cli.checks.html
+++ b/docs/api/cli.checks.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.cloud.base.html b/docs/api/cli.cloud.base.html
index 24fe7624c..564f2e7d7 100644
--- a/docs/api/cli.cloud.base.html
+++ b/docs/api/cli.cloud.base.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.cloud.modal_.html b/docs/api/cli.cloud.modal_.html
index c210733a3..fc3c25f45 100644
--- a/docs/api/cli.cloud.modal_.html
+++ b/docs/api/cli.cloud.modal_.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.config.html b/docs/api/cli.config.html
index 15fc4aefb..d9dc5e775 100644
--- a/docs/api/cli.config.html
+++ b/docs/api/cli.config.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.delinearize_llama4.html b/docs/api/cli.delinearize_llama4.html
index 7bf3c10a8..62527256d 100644
--- a/docs/api/cli.delinearize_llama4.html
+++ b/docs/api/cli.delinearize_llama4.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.evaluate.html b/docs/api/cli.evaluate.html
index d45db111b..10a4ad3f8 100644
--- a/docs/api/cli.evaluate.html
+++ b/docs/api/cli.evaluate.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.inference.html b/docs/api/cli.inference.html
index 1a1297816..6a278657d 100644
--- a/docs/api/cli.inference.html
+++ b/docs/api/cli.inference.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.main.html b/docs/api/cli.main.html
index d1e0bc563..b0fc7f9d9 100644
--- a/docs/api/cli.main.html
+++ b/docs/api/cli.main.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.merge_lora.html b/docs/api/cli.merge_lora.html
index c3c7ec5d5..3b1a098cb 100644
--- a/docs/api/cli.merge_lora.html
+++ b/docs/api/cli.merge_lora.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.merge_sharded_fsdp_weights.html b/docs/api/cli.merge_sharded_fsdp_weights.html
index 6a13f77bf..a6833ce04 100644
--- a/docs/api/cli.merge_sharded_fsdp_weights.html
+++ b/docs/api/cli.merge_sharded_fsdp_weights.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.preprocess.html b/docs/api/cli.preprocess.html
index 181bc1029..5f966ff26 100644
--- a/docs/api/cli.preprocess.html
+++ b/docs/api/cli.preprocess.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.quantize.html b/docs/api/cli.quantize.html
index c2cf00a91..436358b07 100644
--- a/docs/api/cli.quantize.html
+++ b/docs/api/cli.quantize.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.train.html b/docs/api/cli.train.html
index 6a6871ec5..906619b39 100644
--- a/docs/api/cli.train.html
+++ b/docs/api/cli.train.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.utils.args.html b/docs/api/cli.utils.args.html
index 9d683903a..2eb1aec0a 100644
--- a/docs/api/cli.utils.args.html
+++ b/docs/api/cli.utils.args.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.utils.fetch.html b/docs/api/cli.utils.fetch.html
index f0b0ba480..7811e3852 100644
--- a/docs/api/cli.utils.fetch.html
+++ b/docs/api/cli.utils.fetch.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.utils.html b/docs/api/cli.utils.html
index aa0423cfc..758a86bba 100644
--- a/docs/api/cli.utils.html
+++ b/docs/api/cli.utils.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.utils.load.html b/docs/api/cli.utils.load.html
index d27c2b954..4a3759d12 100644
--- a/docs/api/cli.utils.load.html
+++ b/docs/api/cli.utils.load.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.utils.sweeps.html b/docs/api/cli.utils.sweeps.html
index bbcc9d3d3..68aefb0a4 100644
--- a/docs/api/cli.utils.sweeps.html
+++ b/docs/api/cli.utils.sweeps.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.utils.train.html b/docs/api/cli.utils.train.html
index 187feac01..747fbf5fa 100644
--- a/docs/api/cli.utils.train.html
+++ b/docs/api/cli.utils.train.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/cli.vllm_serve.html b/docs/api/cli.vllm_serve.html
index 8b022a820..6a952f15b 100644
--- a/docs/api/cli.vllm_serve.html
+++ b/docs/api/cli.vllm_serve.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/common.architectures.html b/docs/api/common.architectures.html
index 03cec8109..1651c9218 100644
--- a/docs/api/common.architectures.html
+++ b/docs/api/common.architectures.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/common.const.html b/docs/api/common.const.html
index 2b0620700..94a86302d 100644
--- a/docs/api/common.const.html
+++ b/docs/api/common.const.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/common.datasets.html b/docs/api/common.datasets.html
index 8a667affb..03815136b 100644
--- a/docs/api/common.datasets.html
+++ b/docs/api/common.datasets.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/convert.html b/docs/api/convert.html
index 27daf3fc8..c5cb0820d 100644
--- a/docs/api/convert.html
+++ b/docs/api/convert.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.builders.base.html b/docs/api/core.builders.base.html
index 121ee3825..0589c2fc1 100644
--- a/docs/api/core.builders.base.html
+++ b/docs/api/core.builders.base.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.builders.causal.html b/docs/api/core.builders.causal.html
index a04129309..befa3267d 100644
--- a/docs/api/core.builders.causal.html
+++ b/docs/api/core.builders.causal.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.builders.rl.html b/docs/api/core.builders.rl.html
index de583248e..f1a78d4b8 100644
--- a/docs/api/core.builders.rl.html
+++ b/docs/api/core.builders.rl.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.chat.format.chatml.html b/docs/api/core.chat.format.chatml.html
index 89a651ae3..c40910309 100644
--- a/docs/api/core.chat.format.chatml.html
+++ b/docs/api/core.chat.format.chatml.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.chat.format.llama3x.html b/docs/api/core.chat.format.llama3x.html
index e3784d346..f59defbe5 100644
--- a/docs/api/core.chat.format.llama3x.html
+++ b/docs/api/core.chat.format.llama3x.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.chat.format.shared.html b/docs/api/core.chat.format.shared.html
index 449575bee..b99bc3e5c 100644
--- a/docs/api/core.chat.format.shared.html
+++ b/docs/api/core.chat.format.shared.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.chat.messages.html b/docs/api/core.chat.messages.html
index 8bdd25961..1594b4de4 100644
--- a/docs/api/core.chat.messages.html
+++ b/docs/api/core.chat.messages.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.datasets.chat.html b/docs/api/core.datasets.chat.html
index 7ff985de0..69a63c652 100644
--- a/docs/api/core.datasets.chat.html
+++ b/docs/api/core.datasets.chat.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.datasets.transforms.chat_builder.html b/docs/api/core.datasets.transforms.chat_builder.html
index 023cf8267..2bdd19e5d 100644
--- a/docs/api/core.datasets.transforms.chat_builder.html
+++ b/docs/api/core.datasets.transforms.chat_builder.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.trainers.base.html b/docs/api/core.trainers.base.html
index 2f88a9997..7a87857cb 100644
--- a/docs/api/core.trainers.base.html
+++ b/docs/api/core.trainers.base.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.trainers.dpo.trainer.html b/docs/api/core.trainers.dpo.trainer.html
index 4aafef517..3f768a695 100644
--- a/docs/api/core.trainers.dpo.trainer.html
+++ b/docs/api/core.trainers.dpo.trainer.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.trainers.grpo.sampler.html b/docs/api/core.trainers.grpo.sampler.html
index 7355688e7..9a13bec21 100644
--- a/docs/api/core.trainers.grpo.sampler.html
+++ b/docs/api/core.trainers.grpo.sampler.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.trainers.grpo.trainer.html b/docs/api/core.trainers.grpo.trainer.html
index d233df546..44a9dec91 100644
--- a/docs/api/core.trainers.grpo.trainer.html
+++ b/docs/api/core.trainers.grpo.trainer.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.trainers.mamba.html b/docs/api/core.trainers.mamba.html
index 3d86dc3a7..54d681c9c 100644
--- a/docs/api/core.trainers.mamba.html
+++ b/docs/api/core.trainers.mamba.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.trainers.mixins.optimizer.html b/docs/api/core.trainers.mixins.optimizer.html
index fd52e7de4..b94221985 100644
--- a/docs/api/core.trainers.mixins.optimizer.html
+++ b/docs/api/core.trainers.mixins.optimizer.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.trainers.mixins.rng_state_loader.html b/docs/api/core.trainers.mixins.rng_state_loader.html
index 73567ec93..5676960f9 100644
--- a/docs/api/core.trainers.mixins.rng_state_loader.html
+++ b/docs/api/core.trainers.mixins.rng_state_loader.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.trainers.mixins.scheduler.html b/docs/api/core.trainers.mixins.scheduler.html
index 8bd41625d..3e108c55b 100644
--- a/docs/api/core.trainers.mixins.scheduler.html
+++ b/docs/api/core.trainers.mixins.scheduler.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.trainers.trl.html b/docs/api/core.trainers.trl.html
index d0a755e3c..aa6ca081c 100644
--- a/docs/api/core.trainers.trl.html
+++ b/docs/api/core.trainers.trl.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.trainers.utils.html b/docs/api/core.trainers.utils.html
index 14886cb60..124be2523 100644
--- a/docs/api/core.trainers.utils.html
+++ b/docs/api/core.trainers.utils.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/core.training_args.html b/docs/api/core.training_args.html
index 314015780..67d146d57 100644
--- a/docs/api/core.training_args.html
+++ b/docs/api/core.training_args.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/datasets.html b/docs/api/datasets.html
index 39107f342..6beec978c 100644
--- a/docs/api/datasets.html
+++ b/docs/api/datasets.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/evaluate.html b/docs/api/evaluate.html
index 14820c0ca..b809f5f4b 100644
--- a/docs/api/evaluate.html
+++ b/docs/api/evaluate.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/index.html b/docs/api/index.html
index b1591720e..80ebae651 100644
--- a/docs/api/index.html
+++ b/docs/api/index.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -1235,22 +1229,18 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <td>see https://github.com/huggingface/transformers/pull/35834</td>
 </tr>
 <tr class="even">
-<td><a href="../../docs/api/monkeypatch.unsloth_.html#axolotl.monkeypatch.unsloth_">monkeypatch.unsloth_</a></td>
-<td>module for patching with unsloth optimizations</td>
-</tr>
-<tr class="odd">
 <td><a href="../../docs/api/monkeypatch.data.batch_dataset_fetcher.html#axolotl.monkeypatch.data.batch_dataset_fetcher">monkeypatch.data.batch_dataset_fetcher</a></td>
 <td>Monkey patches for the dataset fetcher to handle batches of packed indexes.</td>
 </tr>
-<tr class="even">
+<tr class="odd">
 <td><a href="../../docs/api/monkeypatch.mixtral.html#axolotl.monkeypatch.mixtral">monkeypatch.mixtral</a></td>
 <td>Patches to support multipack for mixtral</td>
 </tr>
-<tr class="odd">
+<tr class="even">
 <td><a href="../../docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html#axolotl.monkeypatch.gradient_checkpointing.offload_cpu">monkeypatch.gradient_checkpointing.offload_cpu</a></td>
 <td>CPU offloaded checkpointing</td>
 </tr>
-<tr class="even">
+<tr class="odd">
 <td><a href="../../docs/api/monkeypatch.gradient_checkpointing.offload_disk.html#axolotl.monkeypatch.gradient_checkpointing.offload_disk">monkeypatch.gradient_checkpointing.offload_disk</a></td>
 <td>DISCO - DIsk-based Storage and Checkpointing with Optimized prefetching</td>
 </tr>
diff --git a/docs/api/integrations.base.html b/docs/api/integrations.base.html
index 626a4a0e6..88edc7796 100644
--- a/docs/api/integrations.base.html
+++ b/docs/api/integrations.base.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/integrations.cut_cross_entropy.args.html b/docs/api/integrations.cut_cross_entropy.args.html
index 8326e1253..98512a5c4 100644
--- a/docs/api/integrations.cut_cross_entropy.args.html
+++ b/docs/api/integrations.cut_cross_entropy.args.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/integrations.grokfast.optimizer.html b/docs/api/integrations.grokfast.optimizer.html
index e956b528a..1b0d28e97 100644
--- a/docs/api/integrations.grokfast.optimizer.html
+++ b/docs/api/integrations.grokfast.optimizer.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/integrations.kd.trainer.html b/docs/api/integrations.kd.trainer.html
index ac8a11c05..f03530545 100644
--- a/docs/api/integrations.kd.trainer.html
+++ b/docs/api/integrations.kd.trainer.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/integrations.liger.args.html b/docs/api/integrations.liger.args.html
index fdb8606c7..1dcfb9d98 100644
--- a/docs/api/integrations.liger.args.html
+++ b/docs/api/integrations.liger.args.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/integrations.lm_eval.args.html b/docs/api/integrations.lm_eval.args.html
index 3d72be59b..3bcb390c7 100644
--- a/docs/api/integrations.lm_eval.args.html
+++ b/docs/api/integrations.lm_eval.args.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/integrations.spectrum.args.html b/docs/api/integrations.spectrum.args.html
index 34e243720..a0186b920 100644
--- a/docs/api/integrations.spectrum.args.html
+++ b/docs/api/integrations.spectrum.args.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/kernels.geglu.html b/docs/api/kernels.geglu.html
index a3a613080..9b8cf5fc5 100644
--- a/docs/api/kernels.geglu.html
+++ b/docs/api/kernels.geglu.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/kernels.lora.html b/docs/api/kernels.lora.html
index 351391e21..6bc81139b 100644
--- a/docs/api/kernels.lora.html
+++ b/docs/api/kernels.lora.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/kernels.quantize.html b/docs/api/kernels.quantize.html
index 967edc147..a2cbf26e3 100644
--- a/docs/api/kernels.quantize.html
+++ b/docs/api/kernels.quantize.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/kernels.swiglu.html b/docs/api/kernels.swiglu.html
index d0bacc428..071605607 100644
--- a/docs/api/kernels.swiglu.html
+++ b/docs/api/kernels.swiglu.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/kernels.utils.html b/docs/api/kernels.utils.html
index 0ae8885eb..250fd9592 100644
--- a/docs/api/kernels.utils.html
+++ b/docs/api/kernels.utils.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/loaders.adapter.html b/docs/api/loaders.adapter.html
index a452cd10c..e2ce542f1 100644
--- a/docs/api/loaders.adapter.html
+++ b/docs/api/loaders.adapter.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/loaders.constants.html b/docs/api/loaders.constants.html
index 80773c313..50c76031b 100644
--- a/docs/api/loaders.constants.html
+++ b/docs/api/loaders.constants.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/loaders.model.html b/docs/api/loaders.model.html
index 87dd915cd..82238cdcc 100644
--- a/docs/api/loaders.model.html
+++ b/docs/api/loaders.model.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/loaders.patch_manager.html b/docs/api/loaders.patch_manager.html
index 095a4a213..51496475a 100644
--- a/docs/api/loaders.patch_manager.html
+++ b/docs/api/loaders.patch_manager.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/loaders.processor.html b/docs/api/loaders.processor.html
index 151712866..03aaae9a7 100644
--- a/docs/api/loaders.processor.html
+++ b/docs/api/loaders.processor.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/loaders.tokenizer.html b/docs/api/loaders.tokenizer.html
index 6ab3e0904..194277055 100644
--- a/docs/api/loaders.tokenizer.html
+++ b/docs/api/loaders.tokenizer.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/logging_config.html b/docs/api/logging_config.html
index 76673ad72..2046710a3 100644
--- a/docs/api/logging_config.html
+++ b/docs/api/logging_config.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/models.mamba.modeling_mamba.html b/docs/api/models.mamba.modeling_mamba.html
index 644951b85..55f3a7d26 100644
--- a/docs/api/models.mamba.modeling_mamba.html
+++ b/docs/api/models.mamba.modeling_mamba.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/monkeypatch.btlm_attn_hijack_flash.html b/docs/api/monkeypatch.btlm_attn_hijack_flash.html
index ba8601973..657ed8bbf 100644
--- a/docs/api/monkeypatch.btlm_attn_hijack_flash.html
+++ b/docs/api/monkeypatch.btlm_attn_hijack_flash.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/monkeypatch.data.batch_dataset_fetcher.html b/docs/api/monkeypatch.data.batch_dataset_fetcher.html
index 0d285334f..0ee8fb678 100644
--- a/docs/api/monkeypatch.data.batch_dataset_fetcher.html
+++ b/docs/api/monkeypatch.data.batch_dataset_fetcher.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html b/docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html
index c3f9aec25..b79178fe0 100644
--- a/docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html
+++ b/docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/monkeypatch.gradient_checkpointing.offload_disk.html b/docs/api/monkeypatch.gradient_checkpointing.offload_disk.html
index deb5aee15..04d6736c0 100644
--- a/docs/api/monkeypatch.gradient_checkpointing.offload_disk.html
+++ b/docs/api/monkeypatch.gradient_checkpointing.offload_disk.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/monkeypatch.llama_attn_hijack_flash.html b/docs/api/monkeypatch.llama_attn_hijack_flash.html
index 8162cdf74..c04044558 100644
--- a/docs/api/monkeypatch.llama_attn_hijack_flash.html
+++ b/docs/api/monkeypatch.llama_attn_hijack_flash.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/monkeypatch.llama_attn_hijack_xformers.html b/docs/api/monkeypatch.llama_attn_hijack_xformers.html
index e272b89e7..e79fac067 100644
--- a/docs/api/monkeypatch.llama_attn_hijack_xformers.html
+++ b/docs/api/monkeypatch.llama_attn_hijack_xformers.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/monkeypatch.lora_kernels.html b/docs/api/monkeypatch.lora_kernels.html
index 0ed8407ac..b33e6dbdb 100644
--- a/docs/api/monkeypatch.lora_kernels.html
+++ b/docs/api/monkeypatch.lora_kernels.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/monkeypatch.mistral_attn_hijack_flash.html b/docs/api/monkeypatch.mistral_attn_hijack_flash.html
index 75e658aab..ec38f3830 100644
--- a/docs/api/monkeypatch.mistral_attn_hijack_flash.html
+++ b/docs/api/monkeypatch.mistral_attn_hijack_flash.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/monkeypatch.mixtral.html b/docs/api/monkeypatch.mixtral.html
index 5161a4ac4..f929fc66a 100644
--- a/docs/api/monkeypatch.mixtral.html
+++ b/docs/api/monkeypatch.mixtral.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/monkeypatch.multipack.html b/docs/api/monkeypatch.multipack.html
index 45192896b..82ae2b744 100644
--- a/docs/api/monkeypatch.multipack.html
+++ b/docs/api/monkeypatch.multipack.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/monkeypatch.relora.html b/docs/api/monkeypatch.relora.html
index 1b126188b..e2eecfb6a 100644
--- a/docs/api/monkeypatch.relora.html
+++ b/docs/api/monkeypatch.relora.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/monkeypatch.stablelm_attn_hijack_flash.html b/docs/api/monkeypatch.stablelm_attn_hijack_flash.html
index fe3db4457..1c1eda79f 100644
--- a/docs/api/monkeypatch.stablelm_attn_hijack_flash.html
+++ b/docs/api/monkeypatch.stablelm_attn_hijack_flash.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/monkeypatch.trainer_fsdp_optim.html b/docs/api/monkeypatch.trainer_fsdp_optim.html
index 2fb07220c..b22d8e03a 100644
--- a/docs/api/monkeypatch.trainer_fsdp_optim.html
+++ b/docs/api/monkeypatch.trainer_fsdp_optim.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/monkeypatch.transformers_fa_utils.html b/docs/api/monkeypatch.transformers_fa_utils.html
index 83cff2f68..8e211c2e1 100644
--- a/docs/api/monkeypatch.transformers_fa_utils.html
+++ b/docs/api/monkeypatch.transformers_fa_utils.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/monkeypatch.unsloth_.html b/docs/api/monkeypatch.unsloth_.html
deleted file mode 100644
index 7d5a95bae..000000000
--- a/docs/api/monkeypatch.unsloth_.html
+++ /dev/null
@@ -1,1178 +0,0 @@
-<!DOCTYPE html>
-<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
-
-<meta charset="utf-8">
-<meta name="generator" content="quarto-1.9.37">
-
-<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
-
-
-<title>monkeypatch.unsloth_ – Axolotl</title>
-<style>
-/* Default styles provided by pandoc.
-** See https://pandoc.org/MANUAL.html#variables-for-html for config info.
-*/
-code{white-space: pre-wrap;}
-span.smallcaps{font-variant: small-caps;}
-div.columns{display: flex; gap: min(4vw, 1.5em);}
-div.column{flex: auto; overflow-x: auto;}
-div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
-ul.task-list{list-style: none;}
-ul.task-list li input[type="checkbox"] {
-  width: 0.8em;
-  margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */ 
-  vertical-align: middle;
-}
-</style>
-
-
-<script src="../../site_libs/quarto-nav/quarto-nav.js"></script>
-<script src="../../site_libs/clipboard/clipboard.min.js"></script>
-<script src="../../site_libs/quarto-search/autocomplete.umd.js"></script>
-<script src="../../site_libs/quarto-search/fuse.min.js"></script>
-<script src="../../site_libs/quarto-search/quarto-search.js"></script>
-<meta name="quarto:offset" content="../../">
-<link href="../../favicon.jpg" rel="icon" type="image/jpeg">
-<script src="../../site_libs/quarto-html/quarto.js" type="module"></script>
-<script src="../../site_libs/quarto-html/tabsets/tabsets.js" type="module"></script>
-<script src="../../site_libs/quarto-html/popper.min.js"></script>
-<script src="../../site_libs/quarto-html/tippy.umd.min.js"></script>
-<script src="../../site_libs/quarto-html/anchor.min.js"></script>
-<link href="../../site_libs/quarto-html/tippy.css" rel="stylesheet">
-<link href="../../site_libs/quarto-html/quarto-syntax-highlighting-dark-d0ae9245876894da5ac7e18953ecc5cc.css" rel="stylesheet" id="quarto-text-highlighting-styles">
-<script src="../../site_libs/bootstrap/bootstrap.min.js"></script>
-<link href="../../site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
-<link href="../../site_libs/bootstrap/bootstrap-ab6ebd6eb475c4578b58908bc314f719.min.css" rel="stylesheet" append-hash="true" id="quarto-bootstrap" data-mode="dark">
-<script id="quarto-search-options" type="application/json">{
-  "location": "navbar",
-  "copy-button": false,
-  "collapse-after": 3,
-  "panel-placement": "end",
-  "type": "overlay",
-  "limit": 50,
-  "keyboard-shortcut": [
-    "f",
-    "/",
-    "s"
-  ],
-  "show-item-context": false,
-  "language": {
-    "search-no-results-text": "No results",
-    "search-matching-documents-text": "matching documents",
-    "search-copy-link-title": "Copy link to search",
-    "search-hide-matches-text": "Hide additional matches",
-    "search-more-match-text": "more match in this document",
-    "search-more-matches-text": "more matches in this document",
-    "search-clear-button-title": "Clear",
-    "search-text-placeholder": "",
-    "search-detached-cancel-button-title": "Cancel",
-    "search-submit-button-title": "Submit",
-    "search-label": "Search"
-  }
-}</script>
-<script async="" src="https://www.googletagmanager.com/gtag/js?id=G-9KYCVJBNMQ"></script>
-
-<script type="text/javascript">
-
-window.dataLayer = window.dataLayer || [];
-function gtag(){dataLayer.push(arguments);}
-gtag('js', new Date());
-gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
-</script>
-
-
-<link rel="stylesheet" href="../../styles.css">
-</head>
-
-<body class="nav-sidebar docked nav-fixed quarto-light">
-
-<div id="quarto-search-results"></div>
-  <header id="quarto-header" class="headroom fixed-top">
-    <nav class="navbar navbar-expand " data-bs-theme="dark">
-      <div class="navbar-container container-fluid">
-      <div class="navbar-brand-container mx-auto">
-    <a href="../../index.html" class="navbar-brand navbar-brand-logo">
-    <img src="../../image/axolotl_logo_digital_white.svg" alt="" class="navbar-logo light-content">
-    <img src="../../image/axolotl_logo_digital_white.svg" alt="" class="navbar-logo dark-content">
-    </a>
-  </div>
-        <div class="quarto-navbar-tools tools-wide tools-end">
-    <a href="https://twitter.com/axolotl_ai" title="" class="quarto-navigation-tool px-1" aria-label=""><i class="bi bi-twitter"></i></a>
-    <a href="https://github.com/axolotl-ai-cloud/axolotl/" title="" class="quarto-navigation-tool px-1" aria-label=""><i class="bi bi-github"></i></a>
-    <a href="https://discord.gg/7m9sfhzaf3" title="" class="quarto-navigation-tool px-1" aria-label=""><i class="bi bi-discord"></i></a>
-</div>
-          <div id="quarto-search" class="" title="Search"></div>
-      </div> <!-- /container-fluid -->
-    </nav>
-  <nav class="quarto-secondary-nav">
-    <div class="container-fluid d-flex">
-      <button type="button" class="quarto-btn-toggle btn" data-bs-toggle="collapse" role="button" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
-        <i class="bi bi-layout-text-sidebar-reverse"></i>
-      </button>
-        <nav class="quarto-page-breadcrumbs" aria-label="breadcrumb"><ol class="breadcrumb"></ol></nav>
-        <a class="flex-grow-1" role="navigation" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">      
-        </a>
-    </div>
-  </nav>
-</header>
-<!-- content -->
-<div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article page-navbar">
-<!-- sidebar -->
-  <nav id="quarto-sidebar" class="sidebar collapse collapse-horizontal quarto-sidebar-collapse-item sidebar-navigation docked overflow-auto">
-    <div class="sidebar-menu-container"> 
-    <ul class="list-unstyled mt-1">
-        <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../index.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Home</span></a>
-  </div>
-</li>
-        <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true">
- <span class="menu-text">Getting Started</span></a>
-          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-1" class="collapse list-unstyled sidebar-section depth1 show">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/getting-started.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Quickstart</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/choosing_method.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Which Fine-Tuning Method Should I Use?</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/installation.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Installation</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/inference.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Inference and Merging</span></a>
-  </div>
-</li>
-          <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" role="navigation" aria-expanded="false">
- <span class="menu-text">Model Guides</span></a>
-          <a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" role="navigation" aria-expanded="false" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-2" class="collapse list-unstyled sidebar-section depth2 ">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/kimi-linear.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Kimi Linear</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/plano.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Plano Orchestrator</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/mimo.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">MiMo</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/internvl3_5.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">InternVL 3.5</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/olmo3.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">OLMo 3</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/trinity.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Trinity</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/arcee.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Arcee AFM</span></a>
-  </div>
-</li>
-          <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="false">
- <span class="menu-text">Ministral3</span></a>
-          <a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="false" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-3" class="collapse list-unstyled sidebar-section depth3 ">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/ministral3.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Ministral3</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/ministral3/think.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Ministral 3 Thinking</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/ministral3/vision.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Ministral 3 Vision</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-          <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-4" role="navigation" aria-expanded="false">
- <span class="menu-text">Magistral</span></a>
-          <a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-4" role="navigation" aria-expanded="false" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-4" class="collapse list-unstyled sidebar-section depth3 ">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/magistral.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Magistral</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/magistral/think.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Magistral Thinking</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/magistral/vision.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Magistral Vision</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/ministral.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Ministral</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/mistral-small.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Mistral Small 3.1/3.2</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/voxtral.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Voxtral</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/devstral.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Devstral</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/mistral.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Mistral 7B</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/llama-4.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Llama 4</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/llama-2.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Llama 2</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/qwen3-next.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Qwen 3 Next</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/qwen3.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Qwen 3</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/gemma3n.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Gemma 3n</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/apertus.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Apertus</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/gpt-oss.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">GPT-OSS</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/seed-oss.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Seed-OSS</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/phi.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Phi</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/smolvlm2.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">SmolVLM 2</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/granite4.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Granite 4</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/LiquidAI.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Liquid Foundation Models 2</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/hunyuan.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Hunyuan</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/jamba.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Jamba</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/models/orpheus.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Orpheus</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/cli.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Command Line Interface (CLI)</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/telemetry.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Telemetry</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/config-reference.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Config Reference</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/api" class="sidebar-item-text sidebar-link">
- <span class="menu-text">API Reference</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-        <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a href="../../docs/dataset-formats/index.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Dataset Formats</span></a>
-          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-5" role="navigation" aria-expanded="true" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-5" class="collapse list-unstyled sidebar-section depth1 show">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/dataset-formats/pretraining.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Pre-training</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/dataset-formats/inst_tune.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Instruction Tuning</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/dataset-formats/conversation.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Conversation</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/dataset-formats/stepwise_supervised.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Stepwise Supervised Format</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/dataset-formats/template_free.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Template-Free</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/dataset-formats/tokenized.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Custom Pre-Tokenized Dataset</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-        <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-6" role="navigation" aria-expanded="true">
- <span class="menu-text">Deployments</span></a>
-          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-6" role="navigation" aria-expanded="true" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-6" class="collapse list-unstyled sidebar-section depth1 show">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/docker.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Docker</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/multi-gpu.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Multi-GPU</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/multi-node.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Multi Node</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Ray Train</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">AMD GPUs on HPC Systems</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/mac.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Mac M-series</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-        <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-7" role="navigation" aria-expanded="true">
- <span class="menu-text">How To Guides</span></a>
-          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-7" role="navigation" aria-expanded="true" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-7" class="collapse list-unstyled sidebar-section depth1 show">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/multimodal.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">MultiModal / Vision Language Models (BETA)</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/rlhf.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">RLHF (Beta)</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/grpo.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">GRPO Training</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/ebft.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">EBFT Training</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/vllm_serving.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">vLLM Serving for GRPO Training</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/reward_modelling.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Reward Modelling</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/lr_groups.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Learning Rate Groups</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/lora_optims.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">LoRA Optimizations</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/dataset_loading.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Dataset Loading</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/qat.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Quantization Aware Training (QAT)</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/quantize.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Quantization with torchao</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/optimizations.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Optimizations Guide</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-        <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-8" role="navigation" aria-expanded="true">
- <span class="menu-text">Core Concepts</span></a>
-          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-8" role="navigation" aria-expanded="true" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-8" class="collapse list-unstyled sidebar-section depth1 show">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/batch_vs_grad.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Batch size vs Gradient accumulation</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/dataset_preprocessing.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Dataset Preprocessing</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/streaming.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Streaming Datasets</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/multipack.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Multipack (Sample Packing)</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/mixed_precision.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Mixed Precision Training</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/optimizers.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Optimizers</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/attention.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Attention</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-        <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-9" role="navigation" aria-expanded="true">
- <span class="menu-text">Advanced Features</span></a>
-          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-9" role="navigation" aria-expanded="true" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-9" class="collapse list-unstyled sidebar-section depth1 show">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">FSDP + QLoRA</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/torchao.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">PyTorch ao</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/custom_integrations.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Custom Integrations</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/sequence_parallelism.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Sequence Parallelism</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/gradient_checkpointing.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Gradient Checkpointing, Activation Offloading, and Layer Offloading</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/nd_parallelism.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">N-D Parallelism (Beta)</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/expert_quantization.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">MoE Expert Quantization</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-        <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-10" role="navigation" aria-expanded="true">
- <span class="menu-text">Troubleshooting</span></a>
-          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-10" role="navigation" aria-expanded="true" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-10" class="collapse list-unstyled sidebar-section depth1 show">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/faq.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">FAQ</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/training_stability.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Training Stability &amp; Debugging</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/debugging.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Debugging</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/nccl.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">NCCL</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-    </ul>
-    </div>
-</nav>
-<div id="quarto-sidebar-glass" class="quarto-sidebar-collapse-item" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item"></div>
-<!-- margin-sidebar -->
-    <div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
-        <nav id="TOC" role="doc-toc" class="toc-active">
-    <h2 id="toc-title">On this page</h2>
-   
-  <ul>
-  <li><a href="#axolotl.monkeypatch.unsloth_" id="toc-axolotl.monkeypatch.unsloth_" class="nav-link active" data-scroll-target="#axolotl.monkeypatch.unsloth_">monkeypatch.unsloth_</a></li>
-  </ul>
-</nav>
-    </div>
-<!-- main -->
-<main class="content" id="quarto-document-content"><header id="title-block-header" class="quarto-title-block"></header>
-
-
-
-
-<section id="axolotl.monkeypatch.unsloth_" class="level1">
-<h1>monkeypatch.unsloth_</h1>
-<p><code>monkeypatch.unsloth_</code></p>
-<p>module for patching with unsloth optimizations</p>
-
-
-</section>
-
-</main> <!-- /main -->
-<script id="quarto-html-after-body" type="application/javascript">
-  window.document.addEventListener("DOMContentLoaded", function (event) {
-    const icon = "";
-    const anchorJS = new window.AnchorJS();
-    anchorJS.options = {
-      placement: 'right',
-      icon: icon
-    };
-    anchorJS.add('.anchored');
-    const isCodeAnnotation = (el) => {
-      for (const clz of el.classList) {
-        if (clz.startsWith('code-annotation-')) {                     
-          return true;
-        }
-      }
-      return false;
-    }
-    const onCopySuccess = function(e) {
-      // button target
-      const button = e.trigger;
-      // don't keep focus
-      button.blur();
-      // flash "checked"
-      button.classList.add('code-copy-button-checked');
-      var currentTitle = button.getAttribute("title");
-      button.setAttribute("title", "Copied!");
-      let tooltip;
-      if (window.bootstrap) {
-        button.setAttribute("data-bs-toggle", "tooltip");
-        button.setAttribute("data-bs-placement", "left");
-        button.setAttribute("data-bs-title", "Copied!");
-        tooltip = new bootstrap.Tooltip(button, 
-          { trigger: "manual", 
-            customClass: "code-copy-button-tooltip",
-            offset: [0, -8]});
-        tooltip.show();    
-      }
-      setTimeout(function() {
-        if (tooltip) {
-          tooltip.hide();
-          button.removeAttribute("data-bs-title");
-          button.removeAttribute("data-bs-toggle");
-          button.removeAttribute("data-bs-placement");
-        }
-        button.setAttribute("title", currentTitle);
-        button.classList.remove('code-copy-button-checked');
-      }, 1000);
-      // clear code selection
-      e.clearSelection();
-    }
-    const getTextToCopy = function(trigger) {
-      const outerScaffold = trigger.parentElement.cloneNode(true);
-      const codeEl = outerScaffold.querySelector('code');
-      for (const childEl of codeEl.children) {
-        if (isCodeAnnotation(childEl)) {
-          childEl.remove();
-        }
-      }
-      return codeEl.innerText;
-    }
-    const clipboard = new window.ClipboardJS('.code-copy-button:not([data-in-quarto-modal])', {
-      text: getTextToCopy
-    });
-    clipboard.on('success', onCopySuccess);
-    if (window.document.getElementById('quarto-embedded-source-code-modal')) {
-      const clipboardModal = new window.ClipboardJS('.code-copy-button[data-in-quarto-modal]', {
-        text: getTextToCopy,
-        container: window.document.getElementById('quarto-embedded-source-code-modal')
-      });
-      clipboardModal.on('success', onCopySuccess);
-    }
-      var localhostRegex = new RegExp(/^(?:http|https):\/\/localhost\:?[0-9]*\//);
-      var mailtoRegex = new RegExp(/^mailto:/);
-        var filterRegex = new RegExp("https:\/\/docs\.axolotl\.ai");
-      var isInternal = (href) => {
-          return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
-      }
-      // Inspect non-navigation links and adorn them if external
-     var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
-      for (var i=0; i<links.length; i++) {
-        const link = links[i];
-        if (!isInternal(link.href)) {
-          // undo the damage that might have been done by quarto-nav.js in the case of
-          // links that we want to consider external
-          if (link.dataset.originalHref !== undefined) {
-            link.href = link.dataset.originalHref;
-          }
-        }
-      }
-    function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
-      const config = {
-        allowHTML: true,
-        maxWidth: 500,
-        delay: 100,
-        arrow: false,
-        appendTo: function(el) {
-            return el.parentElement;
-        },
-        interactive: true,
-        interactiveBorder: 10,
-        theme: 'quarto',
-        placement: 'bottom-start',
-      };
-      if (contentFn) {
-        config.content = contentFn;
-      }
-      if (onTriggerFn) {
-        config.onTrigger = onTriggerFn;
-      }
-      if (onUntriggerFn) {
-        config.onUntrigger = onUntriggerFn;
-      }
-      window.tippy(el, config); 
-    }
-    const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
-    for (var i=0; i<noterefs.length; i++) {
-      const ref = noterefs[i];
-      tippyHover(ref, function() {
-        // use id or data attribute instead here
-        let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
-        try { href = new URL(href).hash; } catch {}
-        const id = href.replace(/^#\/?/, "");
-        const note = window.document.getElementById(id);
-        if (note) {
-          return note.innerHTML;
-        } else {
-          return "";
-        }
-      });
-    }
-    const xrefs = window.document.querySelectorAll('a.quarto-xref');
-    const processXRef = (id, note) => {
-      // Strip column container classes
-      const stripColumnClz = (el) => {
-        el.classList.remove("page-full", "page-columns");
-        if (el.children) {
-          for (const child of el.children) {
-            stripColumnClz(child);
-          }
-        }
-      }
-      stripColumnClz(note)
-      if (id === null || id.startsWith('sec-')) {
-        // Special case sections, only their first couple elements
-        const container = document.createElement("div");
-        if (note.children && note.children.length > 2) {
-          container.appendChild(note.children[0].cloneNode(true));
-          for (let i = 1; i < note.children.length; i++) {
-            const child = note.children[i];
-            if (child.tagName === "P" && child.innerText === "") {
-              continue;
-            } else {
-              container.appendChild(child.cloneNode(true));
-              break;
-            }
-          }
-          if (window.Quarto?.typesetMath) {
-            window.Quarto.typesetMath(container);
-          }
-          return container.innerHTML
-        } else {
-          if (window.Quarto?.typesetMath) {
-            window.Quarto.typesetMath(note);
-          }
-          return note.innerHTML;
-        }
-      } else {
-        // Remove any anchor links if they are present
-        const anchorLink = note.querySelector('a.anchorjs-link');
-        if (anchorLink) {
-          anchorLink.remove();
-        }
-        if (window.Quarto?.typesetMath) {
-          window.Quarto.typesetMath(note);
-        }
-        if (note.classList.contains("callout")) {
-          return note.outerHTML;
-        } else {
-          return note.innerHTML;
-        }
-      }
-    }
-    for (var i=0; i<xrefs.length; i++) {
-      const xref = xrefs[i];
-      tippyHover(xref, undefined, function(instance) {
-        instance.disable();
-        let url = xref.getAttribute('href');
-        let hash = undefined; 
-        if (url.startsWith('#')) {
-          hash = url;
-        } else {
-          try { hash = new URL(url).hash; } catch {}
-        }
-        if (hash) {
-          const id = hash.replace(/^#\/?/, "");
-          const note = window.document.getElementById(id);
-          if (note !== null) {
-            try {
-              const html = processXRef(id, note.cloneNode(true));
-              instance.setContent(html);
-            } finally {
-              instance.enable();
-              instance.show();
-            }
-          } else {
-            // See if we can fetch this
-            fetch(url.split('#')[0])
-            .then(res => res.text())
-            .then(html => {
-              const parser = new DOMParser();
-              const htmlDoc = parser.parseFromString(html, "text/html");
-              const note = htmlDoc.getElementById(id);
-              if (note !== null) {
-                const html = processXRef(id, note);
-                instance.setContent(html);
-              } 
-            }).finally(() => {
-              instance.enable();
-              instance.show();
-            });
-          }
-        } else {
-          // See if we can fetch a full url (with no hash to target)
-          // This is a special case and we should probably do some content thinning / targeting
-          fetch(url)
-          .then(res => res.text())
-          .then(html => {
-            const parser = new DOMParser();
-            const htmlDoc = parser.parseFromString(html, "text/html");
-            const note = htmlDoc.querySelector('main.content');
-            if (note !== null) {
-              // This should only happen for chapter cross references
-              // (since there is no id in the URL)
-              // remove the first header
-              if (note.children.length > 0 && note.children[0].tagName === "HEADER") {
-                note.children[0].remove();
-              }
-              const html = processXRef(null, note);
-              instance.setContent(html);
-            } 
-          }).finally(() => {
-            instance.enable();
-            instance.show();
-          });
-        }
-      }, function(instance) {
-      });
-    }
-        let selectedAnnoteEl;
-        const selectorForAnnotation = ( cell, annotation) => {
-          let cellAttr = 'data-code-cell="' + cell + '"';
-          let lineAttr = 'data-code-annotation="' +  annotation + '"';
-          const selector = 'span[' + cellAttr + '][' + lineAttr + ']';
-          return selector;
-        }
-        const selectCodeLines = (annoteEl) => {
-          const doc = window.document;
-          const targetCell = annoteEl.getAttribute("data-target-cell");
-          const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
-          const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
-          const lines = annoteSpan.getAttribute("data-code-lines").split(",");
-          const lineIds = lines.map((line) => {
-            return targetCell + "-" + line;
-          })
-          let top = null;
-          let height = null;
-          let parent = null;
-          if (lineIds.length > 0) {
-              //compute the position of the single el (top and bottom and make a div)
-              const el = window.document.getElementById(lineIds[0]);
-              top = el.offsetTop;
-              height = el.offsetHeight;
-              parent = el.parentElement.parentElement;
-            if (lineIds.length > 1) {
-              const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
-              const bottom = lastEl.offsetTop + lastEl.offsetHeight;
-              height = bottom - top;
-            }
-            if (top !== null && height !== null && parent !== null) {
-              // cook up a div (if necessary) and position it 
-              let div = window.document.getElementById("code-annotation-line-highlight");
-              if (div === null) {
-                div = window.document.createElement("div");
-                div.setAttribute("id", "code-annotation-line-highlight");
-                div.style.position = 'absolute';
-                parent.appendChild(div);
-              }
-              div.style.top = top - 2 + "px";
-              div.style.height = height + 4 + "px";
-              div.style.left = 0;
-              let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
-              if (gutterDiv === null) {
-                gutterDiv = window.document.createElement("div");
-                gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
-                gutterDiv.style.position = 'absolute';
-                const codeCell = window.document.getElementById(targetCell);
-                const gutter = codeCell.querySelector('.code-annotation-gutter');
-                gutter.appendChild(gutterDiv);
-              }
-              gutterDiv.style.top = top - 2 + "px";
-              gutterDiv.style.height = height + 4 + "px";
-            }
-            selectedAnnoteEl = annoteEl;
-          }
-        };
-        const unselectCodeLines = () => {
-          const elementsIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"];
-          elementsIds.forEach((elId) => {
-            const div = window.document.getElementById(elId);
-            if (div) {
-              div.remove();
-            }
-          });
-          selectedAnnoteEl = undefined;
-        };
-          // Handle positioning of the toggle
-      window.addEventListener(
-        "resize",
-        throttle(() => {
-          elRect = undefined;
-          if (selectedAnnoteEl) {
-            selectCodeLines(selectedAnnoteEl);
-          }
-        }, 10)
-      );
-      function throttle(fn, ms) {
-      let throttle = false;
-      let timer;
-        return (...args) => {
-          if(!throttle) { // first call gets through
-              fn.apply(this, args);
-              throttle = true;
-          } else { // all the others get throttled
-              if(timer) clearTimeout(timer); // cancel #2
-              timer = setTimeout(() => {
-                fn.apply(this, args);
-                timer = throttle = false;
-              }, ms);
-          }
-        };
-      }
-        // Attach click handler to the DT
-        const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
-        for (const annoteDlNode of annoteDls) {
-          annoteDlNode.addEventListener('click', (event) => {
-            const clickedEl = event.target;
-            if (clickedEl !== selectedAnnoteEl) {
-              unselectCodeLines();
-              const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
-              if (activeEl) {
-                activeEl.classList.remove('code-annotation-active');
-              }
-              selectCodeLines(clickedEl);
-              clickedEl.classList.add('code-annotation-active');
-            } else {
-              // Unselect the line
-              unselectCodeLines();
-              clickedEl.classList.remove('code-annotation-active');
-            }
-          });
-        }
-    const findCites = (el) => {
-      const parentEl = el.parentElement;
-      if (parentEl) {
-        const cites = parentEl.dataset.cites;
-        if (cites) {
-          return {
-            el,
-            cites: cites.split(' ')
-          };
-        } else {
-          return findCites(el.parentElement)
-        }
-      } else {
-        return undefined;
-      }
-    };
-    var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
-    for (var i=0; i<bibliorefs.length; i++) {
-      const ref = bibliorefs[i];
-      const citeInfo = findCites(ref);
-      if (citeInfo) {
-        tippyHover(citeInfo.el, function() {
-          var popup = window.document.createElement('div');
-          citeInfo.cites.forEach(function(cite) {
-            var citeDiv = window.document.createElement('div');
-            citeDiv.classList.add('hanging-indent');
-            citeDiv.classList.add('csl-entry');
-            var biblioDiv = window.document.getElementById('ref-' + cite);
-            if (biblioDiv) {
-              citeDiv.innerHTML = biblioDiv.innerHTML;
-            }
-            popup.appendChild(citeDiv);
-          });
-          return popup.innerHTML;
-        });
-      }
-    }
-  });
-  </script>
-</div> <!-- /content -->
-
-
-
-
-</body></html>
\ No newline at end of file
diff --git a/docs/api/monkeypatch.utils.html b/docs/api/monkeypatch.utils.html
index b70879e72..46e6ee3c8 100644
--- a/docs/api/monkeypatch.utils.html
+++ b/docs/api/monkeypatch.utils.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.alpaca_chat.html b/docs/api/prompt_strategies.alpaca_chat.html
index 61a241151..b72c567aa 100644
--- a/docs/api/prompt_strategies.alpaca_chat.html
+++ b/docs/api/prompt_strategies.alpaca_chat.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.alpaca_instruct.html b/docs/api/prompt_strategies.alpaca_instruct.html
index c87a6bf96..64a70b400 100644
--- a/docs/api/prompt_strategies.alpaca_instruct.html
+++ b/docs/api/prompt_strategies.alpaca_instruct.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.alpaca_w_system.html b/docs/api/prompt_strategies.alpaca_w_system.html
index 7e9eee92a..22c6cccb7 100644
--- a/docs/api/prompt_strategies.alpaca_w_system.html
+++ b/docs/api/prompt_strategies.alpaca_w_system.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.base.html b/docs/api/prompt_strategies.base.html
index ac814121a..7041883f9 100644
--- a/docs/api/prompt_strategies.base.html
+++ b/docs/api/prompt_strategies.base.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.bradley_terry.llama3.html b/docs/api/prompt_strategies.bradley_terry.llama3.html
index 51c0fbd3b..c0932768a 100644
--- a/docs/api/prompt_strategies.bradley_terry.llama3.html
+++ b/docs/api/prompt_strategies.bradley_terry.llama3.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.chat_template.html b/docs/api/prompt_strategies.chat_template.html
index 209bc339a..1c013b943 100644
--- a/docs/api/prompt_strategies.chat_template.html
+++ b/docs/api/prompt_strategies.chat_template.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.completion.html b/docs/api/prompt_strategies.completion.html
index 127409e87..7a515ea70 100644
--- a/docs/api/prompt_strategies.completion.html
+++ b/docs/api/prompt_strategies.completion.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.dpo.chat_template.html b/docs/api/prompt_strategies.dpo.chat_template.html
index 9daea51f7..4d8bfd5d9 100644
--- a/docs/api/prompt_strategies.dpo.chat_template.html
+++ b/docs/api/prompt_strategies.dpo.chat_template.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.dpo.chatml.html b/docs/api/prompt_strategies.dpo.chatml.html
index 228a241ca..14d19b781 100644
--- a/docs/api/prompt_strategies.dpo.chatml.html
+++ b/docs/api/prompt_strategies.dpo.chatml.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.dpo.llama3.html b/docs/api/prompt_strategies.dpo.llama3.html
index d258a0cc9..36ed092e7 100644
--- a/docs/api/prompt_strategies.dpo.llama3.html
+++ b/docs/api/prompt_strategies.dpo.llama3.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.dpo.passthrough.html b/docs/api/prompt_strategies.dpo.passthrough.html
index 8f7ee5273..48fc2e1a7 100644
--- a/docs/api/prompt_strategies.dpo.passthrough.html
+++ b/docs/api/prompt_strategies.dpo.passthrough.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.dpo.user_defined.html b/docs/api/prompt_strategies.dpo.user_defined.html
index cc8a87411..4074955ab 100644
--- a/docs/api/prompt_strategies.dpo.user_defined.html
+++ b/docs/api/prompt_strategies.dpo.user_defined.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.dpo.zephyr.html b/docs/api/prompt_strategies.dpo.zephyr.html
index 73ea28ef9..2db251384 100644
--- a/docs/api/prompt_strategies.dpo.zephyr.html
+++ b/docs/api/prompt_strategies.dpo.zephyr.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.input_output.html b/docs/api/prompt_strategies.input_output.html
index 3a58bb4d6..a9f0c142a 100644
--- a/docs/api/prompt_strategies.input_output.html
+++ b/docs/api/prompt_strategies.input_output.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.kto.chatml.html b/docs/api/prompt_strategies.kto.chatml.html
index 0a2252668..66c53009e 100644
--- a/docs/api/prompt_strategies.kto.chatml.html
+++ b/docs/api/prompt_strategies.kto.chatml.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.kto.llama3.html b/docs/api/prompt_strategies.kto.llama3.html
index ab4d4313e..b63464b74 100644
--- a/docs/api/prompt_strategies.kto.llama3.html
+++ b/docs/api/prompt_strategies.kto.llama3.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.kto.user_defined.html b/docs/api/prompt_strategies.kto.user_defined.html
index ed2b14f87..8effa9aa8 100644
--- a/docs/api/prompt_strategies.kto.user_defined.html
+++ b/docs/api/prompt_strategies.kto.user_defined.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.llama2_chat.html b/docs/api/prompt_strategies.llama2_chat.html
index 7647ee07c..a31d34516 100644
--- a/docs/api/prompt_strategies.llama2_chat.html
+++ b/docs/api/prompt_strategies.llama2_chat.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.messages.chat.html b/docs/api/prompt_strategies.messages.chat.html
index 00811aaad..4b0029e52 100644
--- a/docs/api/prompt_strategies.messages.chat.html
+++ b/docs/api/prompt_strategies.messages.chat.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.metharme.html b/docs/api/prompt_strategies.metharme.html
index 78c13c0c4..aa612c7e9 100644
--- a/docs/api/prompt_strategies.metharme.html
+++ b/docs/api/prompt_strategies.metharme.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.orcamini.html b/docs/api/prompt_strategies.orcamini.html
index 4dd22d983..e6a2f3a9c 100644
--- a/docs/api/prompt_strategies.orcamini.html
+++ b/docs/api/prompt_strategies.orcamini.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.orpo.chat_template.html b/docs/api/prompt_strategies.orpo.chat_template.html
index 3355711fb..fa4cf939c 100644
--- a/docs/api/prompt_strategies.orpo.chat_template.html
+++ b/docs/api/prompt_strategies.orpo.chat_template.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.pygmalion.html b/docs/api/prompt_strategies.pygmalion.html
index 4a3bc73b2..05bd0a187 100644
--- a/docs/api/prompt_strategies.pygmalion.html
+++ b/docs/api/prompt_strategies.pygmalion.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.stepwise_supervised.html b/docs/api/prompt_strategies.stepwise_supervised.html
index 97daea234..d8e747e03 100644
--- a/docs/api/prompt_strategies.stepwise_supervised.html
+++ b/docs/api/prompt_strategies.stepwise_supervised.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_strategies.user_defined.html b/docs/api/prompt_strategies.user_defined.html
index 46afe34a8..6220251e7 100644
--- a/docs/api/prompt_strategies.user_defined.html
+++ b/docs/api/prompt_strategies.user_defined.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/prompt_tokenizers.html b/docs/api/prompt_tokenizers.html
index 8c62d9675..c1a6041aa 100644
--- a/docs/api/prompt_tokenizers.html
+++ b/docs/api/prompt_tokenizers.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/train.html b/docs/api/train.html
index d785a92fe..5635662a2 100644
--- a/docs/api/train.html
+++ b/docs/api/train.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.bench.html b/docs/api/utils.bench.html
index c30ecaf51..e7b6ab70b 100644
--- a/docs/api/utils.bench.html
+++ b/docs/api/utils.bench.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.callbacks.comet_.html b/docs/api/utils.callbacks.comet_.html
index c6f602624..e2ee816f5 100644
--- a/docs/api/utils.callbacks.comet_.html
+++ b/docs/api/utils.callbacks.comet_.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.callbacks.lisa.html b/docs/api/utils.callbacks.lisa.html
index bb40e4071..cf3c11a97 100644
--- a/docs/api/utils.callbacks.lisa.html
+++ b/docs/api/utils.callbacks.lisa.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.callbacks.mlflow_.html b/docs/api/utils.callbacks.mlflow_.html
index f4d767948..254401142 100644
--- a/docs/api/utils.callbacks.mlflow_.html
+++ b/docs/api/utils.callbacks.mlflow_.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.callbacks.perplexity.html b/docs/api/utils.callbacks.perplexity.html
index bdd380ebf..557f64396 100644
--- a/docs/api/utils.callbacks.perplexity.html
+++ b/docs/api/utils.callbacks.perplexity.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.callbacks.profiler.html b/docs/api/utils.callbacks.profiler.html
index b69afaea5..58f2d8313 100644
--- a/docs/api/utils.callbacks.profiler.html
+++ b/docs/api/utils.callbacks.profiler.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.callbacks.qat.html b/docs/api/utils.callbacks.qat.html
index 885b9ae31..75d425301 100644
--- a/docs/api/utils.callbacks.qat.html
+++ b/docs/api/utils.callbacks.qat.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.chat_templates.html b/docs/api/utils.chat_templates.html
index 20e01791d..f5ae5090a 100644
--- a/docs/api/utils.chat_templates.html
+++ b/docs/api/utils.chat_templates.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.collators.batching.html b/docs/api/utils.collators.batching.html
index 22f974ac3..c2f666eef 100644
--- a/docs/api/utils.collators.batching.html
+++ b/docs/api/utils.collators.batching.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.collators.core.html b/docs/api/utils.collators.core.html
index 69b224a10..897b9f699 100644
--- a/docs/api/utils.collators.core.html
+++ b/docs/api/utils.collators.core.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.collators.mamba.html b/docs/api/utils.collators.mamba.html
index 48e0cbd8b..83944f998 100644
--- a/docs/api/utils.collators.mamba.html
+++ b/docs/api/utils.collators.mamba.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.collators.mm_chat.html b/docs/api/utils.collators.mm_chat.html
index fd2f39b31..c18253c40 100644
--- a/docs/api/utils.collators.mm_chat.html
+++ b/docs/api/utils.collators.mm_chat.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.ctx_managers.sequence_parallel.html b/docs/api/utils.ctx_managers.sequence_parallel.html
index 5d9263f99..add8dcd98 100644
--- a/docs/api/utils.ctx_managers.sequence_parallel.html
+++ b/docs/api/utils.ctx_managers.sequence_parallel.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.data.sft.html b/docs/api/utils.data.sft.html
index 97b4acb48..150d91487 100644
--- a/docs/api/utils.data.sft.html
+++ b/docs/api/utils.data.sft.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.data.streaming.html b/docs/api/utils.data.streaming.html
index 6b05e9b5c..ae6b42fd8 100644
--- a/docs/api/utils.data.streaming.html
+++ b/docs/api/utils.data.streaming.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.dict.html b/docs/api/utils.dict.html
index caf8d3229..0dff30abd 100644
--- a/docs/api/utils.dict.html
+++ b/docs/api/utils.dict.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.distributed.html b/docs/api/utils.distributed.html
index 60d38b7d3..a984482d1 100644
--- a/docs/api/utils.distributed.html
+++ b/docs/api/utils.distributed.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.freeze.html b/docs/api/utils.freeze.html
index 256970e4a..a2dd93523 100644
--- a/docs/api/utils.freeze.html
+++ b/docs/api/utils.freeze.html
@@ -725,12 +725,6 @@ window.Quarto = {
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.lora.html b/docs/api/utils.lora.html
index 818affacd..471d8113b 100644
--- a/docs/api/utils.lora.html
+++ b/docs/api/utils.lora.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.model_shard_quant.html b/docs/api/utils.model_shard_quant.html
index de0c7fb50..d5146dba9 100644
--- a/docs/api/utils.model_shard_quant.html
+++ b/docs/api/utils.model_shard_quant.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.optimizers.adopt.html b/docs/api/utils.optimizers.adopt.html
index 2dbb0928c..d7236ce1d 100644
--- a/docs/api/utils.optimizers.adopt.html
+++ b/docs/api/utils.optimizers.adopt.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.quantization.html b/docs/api/utils.quantization.html
index 23093e374..b04223268 100644
--- a/docs/api/utils.quantization.html
+++ b/docs/api/utils.quantization.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.samplers.multipack.html b/docs/api/utils.samplers.multipack.html
index b6fec1508..f0a2583b4 100644
--- a/docs/api/utils.samplers.multipack.html
+++ b/docs/api/utils.samplers.multipack.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.schedulers.html b/docs/api/utils.schedulers.html
index 5f316f31f..d69128171 100644
--- a/docs/api/utils.schedulers.html
+++ b/docs/api/utils.schedulers.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.schemas.config.html b/docs/api/utils.schemas.config.html
index 2bad2e5a6..ab05dd163 100644
--- a/docs/api/utils.schemas.config.html
+++ b/docs/api/utils.schemas.config.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.schemas.datasets.html b/docs/api/utils.schemas.datasets.html
index ad46eae8d..fe0a79573 100644
--- a/docs/api/utils.schemas.datasets.html
+++ b/docs/api/utils.schemas.datasets.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.schemas.enums.html b/docs/api/utils.schemas.enums.html
index 7732e77d4..cf574f4fa 100644
--- a/docs/api/utils.schemas.enums.html
+++ b/docs/api/utils.schemas.enums.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.schemas.integrations.html b/docs/api/utils.schemas.integrations.html
index 56871e2dc..8a7fdeea2 100644
--- a/docs/api/utils.schemas.integrations.html
+++ b/docs/api/utils.schemas.integrations.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.schemas.model.html b/docs/api/utils.schemas.model.html
index 480586a36..559ebba9e 100644
--- a/docs/api/utils.schemas.model.html
+++ b/docs/api/utils.schemas.model.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.schemas.multimodal.html b/docs/api/utils.schemas.multimodal.html
index 64e0541f9..5df7bc8cd 100644
--- a/docs/api/utils.schemas.multimodal.html
+++ b/docs/api/utils.schemas.multimodal.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.schemas.peft.html b/docs/api/utils.schemas.peft.html
index 89bb07a90..4c4e236ea 100644
--- a/docs/api/utils.schemas.peft.html
+++ b/docs/api/utils.schemas.peft.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.schemas.training.html b/docs/api/utils.schemas.training.html
index 876d09db5..a988aa328 100644
--- a/docs/api/utils.schemas.training.html
+++ b/docs/api/utils.schemas.training.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.schemas.trl.html b/docs/api/utils.schemas.trl.html
index f0d2c95d1..6d020c01a 100644
--- a/docs/api/utils.schemas.trl.html
+++ b/docs/api/utils.schemas.trl.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.schemas.utils.html b/docs/api/utils.schemas.utils.html
index fe0f9f841..3ca435063 100644
--- a/docs/api/utils.schemas.utils.html
+++ b/docs/api/utils.schemas.utils.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.tokenization.html b/docs/api/utils.tokenization.html
index 41b7e2aad..2dbda523d 100644
--- a/docs/api/utils.tokenization.html
+++ b/docs/api/utils.tokenization.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/api/utils.trainer.html b/docs/api/utils.trainer.html
index 5c7bf728f..b08afd351 100644
--- a/docs/api/utils.trainer.html
+++ b/docs/api/utils.trainer.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/attention.html b/docs/attention.html
index 56d0083f3..9029e1c24 100644
--- a/docs/attention.html
+++ b/docs/attention.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/batch_vs_grad.html b/docs/batch_vs_grad.html
index 40c90fbc4..7822d1be5 100644
--- a/docs/batch_vs_grad.html
+++ b/docs/batch_vs_grad.html
@@ -662,12 +662,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/checkpoint_saving.html b/docs/checkpoint_saving.html
index 205c5505a..8c5f23ce2 100644
--- a/docs/checkpoint_saving.html
+++ b/docs/checkpoint_saving.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/choosing_method.html b/docs/choosing_method.html
index 05b0e3bf2..22f2bf61a 100644
--- a/docs/choosing_method.html
+++ b/docs/choosing_method.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/cli.html b/docs/cli.html
index 54f283000..ba7912134 100644
--- a/docs/cli.html
+++ b/docs/cli.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/config-reference.html b/docs/config-reference.html
index 82f18d9ef..2c637fda7 100644
--- a/docs/config-reference.html
+++ b/docs/config-reference.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -1817,637 +1811,630 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <span id="cb1-1003"><a href="#cb1-1003" aria-hidden="true" tabindex="-1"></a><span class="co"># length generalization.</span></span>
 <span id="cb1-1004"><a href="#cb1-1004" aria-hidden="true" tabindex="-1"></a><span class="fu">scaling_softmax_bias</span><span class="kw">:</span><span class="at"> float | None</span></span>
 <span id="cb1-1005"><a href="#cb1-1005" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1006"><a href="#cb1-1006" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_cross_entropy_loss</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1007"><a href="#cb1-1007" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_lora_mlp</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1008"><a href="#cb1-1008" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_lora_qkv</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1009"><a href="#cb1-1009" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_lora_o</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1010"><a href="#cb1-1010" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_rms_norm</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1011"><a href="#cb1-1011" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_rope</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1012"><a href="#cb1-1012" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1013"><a href="#cb1-1013" aria-hidden="true" tabindex="-1"></a><span class="co"># Apply custom LoRA autograd functions and activation function Triton kernels for speed</span></span>
-<span id="cb1-1014"><a href="#cb1-1014" aria-hidden="true" tabindex="-1"></a><span class="co"># and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html</span></span>
-<span id="cb1-1015"><a href="#cb1-1015" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_mlp_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1016"><a href="#cb1-1016" aria-hidden="true" tabindex="-1"></a><span class="co"># Apply custom LoRA autograd functions and activation function Triton kernels for speed</span></span>
-<span id="cb1-1017"><a href="#cb1-1017" aria-hidden="true" tabindex="-1"></a><span class="co"># and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html</span></span>
-<span id="cb1-1018"><a href="#cb1-1018" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_qkv_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1019"><a href="#cb1-1019" aria-hidden="true" tabindex="-1"></a><span class="co"># Apply custom LoRA autograd functions and activation function Triton kernels for speed</span></span>
-<span id="cb1-1020"><a href="#cb1-1020" aria-hidden="true" tabindex="-1"></a><span class="co"># and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html</span></span>
-<span id="cb1-1021"><a href="#cb1-1021" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_o_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1022"><a href="#cb1-1022" aria-hidden="true" tabindex="-1"></a><span class="co"># Apply custom LoRA autograd function for embedding layers. See:</span></span>
-<span id="cb1-1023"><a href="#cb1-1023" aria-hidden="true" tabindex="-1"></a><span class="co"># https://docs.axolotl.ai/docs/lora_optims.html</span></span>
-<span id="cb1-1024"><a href="#cb1-1024" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_embedding_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1025"><a href="#cb1-1025" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1026"><a href="#cb1-1026" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use chunked cross entropy loss for memory efficiency</span></span>
-<span id="cb1-1027"><a href="#cb1-1027" aria-hidden="true" tabindex="-1"></a><span class="fu">chunked_cross_entropy</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1028"><a href="#cb1-1028" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of chunks to use for chunked cross entropy loss</span></span>
-<span id="cb1-1029"><a href="#cb1-1029" aria-hidden="true" tabindex="-1"></a><span class="fu">chunked_cross_entropy_num_chunks</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1030"><a href="#cb1-1030" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable Entropy-Aware Focal Training loss (EAFT)</span></span>
-<span id="cb1-1031"><a href="#cb1-1031" aria-hidden="true" tabindex="-1"></a><span class="fu">use_eaft</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1032"><a href="#cb1-1032" aria-hidden="true" tabindex="-1"></a><span class="co"># Exponent for entropy weighting in EAFT (default: 1.0)</span></span>
-<span id="cb1-1033"><a href="#cb1-1033" aria-hidden="true" tabindex="-1"></a><span class="fu">eaft_alpha</span><span class="kw">:</span><span class="at"> float | None = 1.0</span></span>
-<span id="cb1-1034"><a href="#cb1-1034" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of top logits for entropy approximation (default: 20)</span></span>
-<span id="cb1-1035"><a href="#cb1-1035" aria-hidden="true" tabindex="-1"></a><span class="fu">eaft_k</span><span class="kw">:</span><span class="at"> int | None = 20</span></span>
+<span id="cb1-1006"><a href="#cb1-1006" aria-hidden="true" tabindex="-1"></a><span class="co"># Apply custom LoRA autograd functions and activation function Triton kernels for speed</span></span>
+<span id="cb1-1007"><a href="#cb1-1007" aria-hidden="true" tabindex="-1"></a><span class="co"># and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html</span></span>
+<span id="cb1-1008"><a href="#cb1-1008" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_mlp_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1009"><a href="#cb1-1009" aria-hidden="true" tabindex="-1"></a><span class="co"># Apply custom LoRA autograd functions and activation function Triton kernels for speed</span></span>
+<span id="cb1-1010"><a href="#cb1-1010" aria-hidden="true" tabindex="-1"></a><span class="co"># and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html</span></span>
+<span id="cb1-1011"><a href="#cb1-1011" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_qkv_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1012"><a href="#cb1-1012" aria-hidden="true" tabindex="-1"></a><span class="co"># Apply custom LoRA autograd functions and activation function Triton kernels for speed</span></span>
+<span id="cb1-1013"><a href="#cb1-1013" aria-hidden="true" tabindex="-1"></a><span class="co"># and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html</span></span>
+<span id="cb1-1014"><a href="#cb1-1014" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_o_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1015"><a href="#cb1-1015" aria-hidden="true" tabindex="-1"></a><span class="co"># Apply custom LoRA autograd function for embedding layers. See:</span></span>
+<span id="cb1-1016"><a href="#cb1-1016" aria-hidden="true" tabindex="-1"></a><span class="co"># https://docs.axolotl.ai/docs/lora_optims.html</span></span>
+<span id="cb1-1017"><a href="#cb1-1017" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_embedding_kernel</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1018"><a href="#cb1-1018" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1019"><a href="#cb1-1019" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use chunked cross entropy loss for memory efficiency</span></span>
+<span id="cb1-1020"><a href="#cb1-1020" aria-hidden="true" tabindex="-1"></a><span class="fu">chunked_cross_entropy</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1021"><a href="#cb1-1021" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of chunks to use for chunked cross entropy loss</span></span>
+<span id="cb1-1022"><a href="#cb1-1022" aria-hidden="true" tabindex="-1"></a><span class="fu">chunked_cross_entropy_num_chunks</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1023"><a href="#cb1-1023" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable Entropy-Aware Focal Training loss (EAFT)</span></span>
+<span id="cb1-1024"><a href="#cb1-1024" aria-hidden="true" tabindex="-1"></a><span class="fu">use_eaft</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1025"><a href="#cb1-1025" aria-hidden="true" tabindex="-1"></a><span class="co"># Exponent for entropy weighting in EAFT (default: 1.0)</span></span>
+<span id="cb1-1026"><a href="#cb1-1026" aria-hidden="true" tabindex="-1"></a><span class="fu">eaft_alpha</span><span class="kw">:</span><span class="at"> float | None = 1.0</span></span>
+<span id="cb1-1027"><a href="#cb1-1027" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of top logits for entropy approximation (default: 20)</span></span>
+<span id="cb1-1028"><a href="#cb1-1028" aria-hidden="true" tabindex="-1"></a><span class="fu">eaft_k</span><span class="kw">:</span><span class="at"> int | None = 20</span></span>
+<span id="cb1-1029"><a href="#cb1-1029" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1030"><a href="#cb1-1030" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use ALST tiled mlp for memory efficient long context</span></span>
+<span id="cb1-1031"><a href="#cb1-1031" aria-hidden="true" tabindex="-1"></a><span class="fu">tiled_mlp</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1032"><a href="#cb1-1032" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1033"><a href="#cb1-1033" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of shards to use for ALST tiled mlp. If unset, it will be set based on</span></span>
+<span id="cb1-1034"><a href="#cb1-1034" aria-hidden="true" tabindex="-1"></a><span class="co"># seqlen/hidden_size</span></span>
+<span id="cb1-1035"><a href="#cb1-1035" aria-hidden="true" tabindex="-1"></a><span class="fu">tiled_mlp_num_shards</span><span class="kw">:</span><span class="at"> int | None</span></span>
 <span id="cb1-1036"><a href="#cb1-1036" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1037"><a href="#cb1-1037" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use ALST tiled mlp for memory efficient long context</span></span>
-<span id="cb1-1038"><a href="#cb1-1038" aria-hidden="true" tabindex="-1"></a><span class="fu">tiled_mlp</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1039"><a href="#cb1-1039" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1040"><a href="#cb1-1040" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of shards to use for ALST tiled mlp. If unset, it will be set based on</span></span>
-<span id="cb1-1041"><a href="#cb1-1041" aria-hidden="true" tabindex="-1"></a><span class="co"># seqlen/hidden_size</span></span>
-<span id="cb1-1042"><a href="#cb1-1042" aria-hidden="true" tabindex="-1"></a><span class="fu">tiled_mlp_num_shards</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1043"><a href="#cb1-1043" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1044"><a href="#cb1-1044" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on</span></span>
-<span id="cb1-1045"><a href="#cb1-1045" aria-hidden="true" tabindex="-1"></a><span class="co"># llama.</span></span>
-<span id="cb1-1046"><a href="#cb1-1046" aria-hidden="true" tabindex="-1"></a><span class="fu">tiled_mlp_use_original_mlp</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
-<span id="cb1-1047"><a href="#cb1-1047" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1048"><a href="#cb1-1048" aria-hidden="true" tabindex="-1"></a><span class="fu">llama4_linearized_experts</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1037"><a href="#cb1-1037" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on</span></span>
+<span id="cb1-1038"><a href="#cb1-1038" aria-hidden="true" tabindex="-1"></a><span class="co"># llama.</span></span>
+<span id="cb1-1039"><a href="#cb1-1039" aria-hidden="true" tabindex="-1"></a><span class="fu">tiled_mlp_use_original_mlp</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
+<span id="cb1-1040"><a href="#cb1-1040" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1041"><a href="#cb1-1041" aria-hidden="true" tabindex="-1"></a><span class="fu">llama4_linearized_experts</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1042"><a href="#cb1-1042" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1043"><a href="#cb1-1043" aria-hidden="true" tabindex="-1"></a><span class="co"># Deepspeed config path. e.g., deepspeed_configs/zero3.json</span></span>
+<span id="cb1-1044"><a href="#cb1-1044" aria-hidden="true" tabindex="-1"></a><span class="fu">deepspeed</span><span class="kw">:</span><span class="at"> str | dict[str, Any] | None</span></span>
+<span id="cb1-1045"><a href="#cb1-1045" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use deepcompile for faster training with deepspeed</span></span>
+<span id="cb1-1046"><a href="#cb1-1046" aria-hidden="true" tabindex="-1"></a><span class="fu">deepcompile</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1047"><a href="#cb1-1047" aria-hidden="true" tabindex="-1"></a><span class="co"># FSDP configuration</span></span>
+<span id="cb1-1048"><a href="#cb1-1048" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
 <span id="cb1-1049"><a href="#cb1-1049" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1050"><a href="#cb1-1050" aria-hidden="true" tabindex="-1"></a><span class="co"># Deepspeed config path. e.g., deepspeed_configs/zero3.json</span></span>
-<span id="cb1-1051"><a href="#cb1-1051" aria-hidden="true" tabindex="-1"></a><span class="fu">deepspeed</span><span class="kw">:</span><span class="at"> str | dict[str, Any] | None</span></span>
-<span id="cb1-1052"><a href="#cb1-1052" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use deepcompile for faster training with deepspeed</span></span>
-<span id="cb1-1053"><a href="#cb1-1053" aria-hidden="true" tabindex="-1"></a><span class="fu">deepcompile</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1054"><a href="#cb1-1054" aria-hidden="true" tabindex="-1"></a><span class="co"># FSDP configuration</span></span>
-<span id="cb1-1055"><a href="#cb1-1055" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-1056"><a href="#cb1-1056" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1057"><a href="#cb1-1057" aria-hidden="true" tabindex="-1"></a><span class="co"># FSDP configuration options</span></span>
-<span id="cb1-1058"><a href="#cb1-1058" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_config</span><span class="kw">:</span><span class="at"> FSDPConfig | None</span></span>
-<span id="cb1-1059"><a href="#cb1-1059" aria-hidden="true" tabindex="-1"></a><span class="co">  # For FSDPConfig:</span></span>
-<span id="cb1-1060"><a href="#cb1-1060" aria-hidden="true" tabindex="-1"></a><span class="co">  # FSDP version</span></span>
-<span id="cb1-1061"><a href="#cb1-1061" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_version</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1062"><a href="#cb1-1062" aria-hidden="true" tabindex="-1"></a><span class="co">  # Enable activation checkpointing to reduce memory usage during forward passes</span></span>
-<span id="cb1-1063"><a href="#cb1-1063" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">activation_checkpointing</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1064"><a href="#cb1-1064" aria-hidden="true" tabindex="-1"></a><span class="co">  # Offload parameters to CPU to reduce GPU memory usage</span></span>
-<span id="cb1-1065"><a href="#cb1-1065" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">offload_params</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1066"><a href="#cb1-1066" aria-hidden="true" tabindex="-1"></a><span class="co">  # Synchronize module states across all processes</span></span>
-<span id="cb1-1067"><a href="#cb1-1067" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">sync_module_states</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1068"><a href="#cb1-1068" aria-hidden="true" tabindex="-1"></a><span class="co">  # Enable CPU RAM efficient loading to reduce memory usage during model loading</span></span>
-<span id="cb1-1069"><a href="#cb1-1069" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">cpu_ram_efficient_loading</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1070"><a href="#cb1-1070" aria-hidden="true" tabindex="-1"></a><span class="co">  # Disabling this enables swap memory usage for resource-constrained setups when</span></span>
-<span id="cb1-1071"><a href="#cb1-1071" aria-hidden="true" tabindex="-1"></a><span class="co">  # offload_params is enabled.</span></span>
-<span id="cb1-1072"><a href="#cb1-1072" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">cpu_offload_pin_memory</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1073"><a href="#cb1-1073" aria-hidden="true" tabindex="-1"></a><span class="co">  # Use original parameters instead of flattened parameters</span></span>
-<span id="cb1-1074"><a href="#cb1-1074" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">use_orig_params</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1075"><a href="#cb1-1075" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1076"><a href="#cb1-1076" aria-hidden="true" tabindex="-1"></a><span class="co">  # Type of state dict to use for saving/loading checkpoints</span></span>
-<span id="cb1-1077"><a href="#cb1-1077" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">state_dict_type</span><span class="kw">:</span><span class="at"> Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None</span></span>
-<span id="cb1-1078"><a href="#cb1-1078" aria-hidden="true" tabindex="-1"></a><span class="co">  # Final state dict type to use after training completion</span></span>
-<span id="cb1-1079"><a href="#cb1-1079" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">final_state_dict_type</span><span class="kw">:</span><span class="at"> Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None</span></span>
-<span id="cb1-1080"><a href="#cb1-1080" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1081"><a href="#cb1-1081" aria-hidden="true" tabindex="-1"></a><span class="co">  # Policy for automatically wrapping modules with FSDP</span></span>
-<span id="cb1-1082"><a href="#cb1-1082" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">auto_wrap_policy</span><span class="kw">:</span><span class="at"> Literal['TRANSFORMER_BASED_WRAP', 'SIZE_BASED_WRAP'] | None</span></span>
-<span id="cb1-1083"><a href="#cb1-1083" aria-hidden="true" tabindex="-1"></a><span class="co">  # Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')</span></span>
-<span id="cb1-1084"><a href="#cb1-1084" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">transformer_layer_cls_to_wrap</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1085"><a href="#cb1-1085" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1086"><a href="#cb1-1086" aria-hidden="true" tabindex="-1"></a><span class="co">  # Reshard parameters after forward pass to save memory</span></span>
-<span id="cb1-1087"><a href="#cb1-1087" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">reshard_after_forward</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1088"><a href="#cb1-1088" aria-hidden="true" tabindex="-1"></a><span class="co">  # Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')</span></span>
-<span id="cb1-1089"><a href="#cb1-1089" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">mixed_precision_policy</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1090"><a href="#cb1-1090" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1091"><a href="#cb1-1091" aria-hidden="true" tabindex="-1"></a><span class="co"># FSDP version</span></span>
-<span id="cb1-1092"><a href="#cb1-1092" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_version</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1093"><a href="#cb1-1093" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_final_state_dict_type</span><span class="kw">:</span><span class="at"> Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None</span></span>
-<span id="cb1-1094"><a href="#cb1-1094" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1095"><a href="#cb1-1095" aria-hidden="true" tabindex="-1"></a><span class="co"># How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for</span></span>
-<span id="cb1-1096"><a href="#cb1-1096" aria-hidden="true" tabindex="-1"></a><span class="co"># no eval.</span></span>
-<span id="cb1-1097"><a href="#cb1-1097" aria-hidden="true" tabindex="-1"></a><span class="fu">val_set_size</span><span class="kw">:</span><span class="at"> float | None = 0.0</span></span>
-<span id="cb1-1098"><a href="#cb1-1098" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1099"><a href="#cb1-1099" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of devices to shard across. If not set, will use all available devices.</span></span>
-<span id="cb1-1100"><a href="#cb1-1100" aria-hidden="true" tabindex="-1"></a><span class="fu">dp_shard_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1101"><a href="#cb1-1101" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of devices to replicate across.</span></span>
-<span id="cb1-1102"><a href="#cb1-1102" aria-hidden="true" tabindex="-1"></a><span class="fu">dp_replicate_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1103"><a href="#cb1-1103" aria-hidden="true" tabindex="-1"></a><span class="co"># Deprecated: use `context_parallel_size` instead</span></span>
-<span id="cb1-1104"><a href="#cb1-1104" aria-hidden="true" tabindex="-1"></a><span class="fu">sequence_parallel_degree</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1105"><a href="#cb1-1105" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to a divisor of the number of GPUs available to split sequences into chunks of</span></span>
-<span id="cb1-1106"><a href="#cb1-1106" aria-hidden="true" tabindex="-1"></a><span class="co"># equal size. Use in long context training to prevent OOM when sequences cannot fit into</span></span>
-<span id="cb1-1107"><a href="#cb1-1107" aria-hidden="true" tabindex="-1"></a><span class="co"># a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each</span></span>
-<span id="cb1-1108"><a href="#cb1-1108" aria-hidden="true" tabindex="-1"></a><span class="co"># sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized</span></span>
-<span id="cb1-1109"><a href="#cb1-1109" aria-hidden="true" tabindex="-1"></a><span class="co"># subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more</span></span>
-<span id="cb1-1110"><a href="#cb1-1110" aria-hidden="true" tabindex="-1"></a><span class="co"># details.</span></span>
-<span id="cb1-1111"><a href="#cb1-1111" aria-hidden="true" tabindex="-1"></a><span class="fu">context_parallel_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1112"><a href="#cb1-1112" aria-hidden="true" tabindex="-1"></a><span class="co"># Optional; strides across the key dimension. Larger values use more memory but should</span></span>
-<span id="cb1-1113"><a href="#cb1-1113" aria-hidden="true" tabindex="-1"></a><span class="co"># make training faster. Must evenly divide the number of KV heads in your model.</span></span>
-<span id="cb1-1114"><a href="#cb1-1114" aria-hidden="true" tabindex="-1"></a><span class="fu">heads_k_stride</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1115"><a href="#cb1-1115" aria-hidden="true" tabindex="-1"></a><span class="co"># One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to</span></span>
-<span id="cb1-1116"><a href="#cb1-1116" aria-hidden="true" tabindex="-1"></a><span class="co"># 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing</span></span>
-<span id="cb1-1117"><a href="#cb1-1117" aria-hidden="true" tabindex="-1"></a><span class="co"># case.</span></span>
-<span id="cb1-1118"><a href="#cb1-1118" aria-hidden="true" tabindex="-1"></a><span class="fu">ring_attn_func</span><span class="kw">:</span><span class="at"> RingAttnFunc | None</span></span>
-<span id="cb1-1119"><a href="#cb1-1119" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP.</span></span>
-<span id="cb1-1120"><a href="#cb1-1120" aria-hidden="true" tabindex="-1"></a><span class="fu">tensor_parallel_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1121"><a href="#cb1-1121" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1122"><a href="#cb1-1122" aria-hidden="true" tabindex="-1"></a><span class="co"># Add or change special tokens. If you add tokens here, you don't need to add them to</span></span>
-<span id="cb1-1123"><a href="#cb1-1123" aria-hidden="true" tabindex="-1"></a><span class="co"># the `tokens` list.</span></span>
-<span id="cb1-1124"><a href="#cb1-1124" aria-hidden="true" tabindex="-1"></a><span class="fu">special_tokens</span><span class="kw">:</span><span class="at"> SpecialTokensConfig | None</span></span>
-<span id="cb1-1125"><a href="#cb1-1125" aria-hidden="true" tabindex="-1"></a><span class="co">  # For SpecialTokensConfig:</span></span>
-<span id="cb1-1126"><a href="#cb1-1126" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">bos_token</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1127"><a href="#cb1-1127" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">eos_token</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1128"><a href="#cb1-1128" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">pad_token</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1129"><a href="#cb1-1129" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">unk_token</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1130"><a href="#cb1-1130" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">additional_special_tokens</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-1050"><a href="#cb1-1050" aria-hidden="true" tabindex="-1"></a><span class="co"># FSDP configuration options</span></span>
+<span id="cb1-1051"><a href="#cb1-1051" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_config</span><span class="kw">:</span><span class="at"> FSDPConfig | None</span></span>
+<span id="cb1-1052"><a href="#cb1-1052" aria-hidden="true" tabindex="-1"></a><span class="co">  # For FSDPConfig:</span></span>
+<span id="cb1-1053"><a href="#cb1-1053" aria-hidden="true" tabindex="-1"></a><span class="co">  # FSDP version</span></span>
+<span id="cb1-1054"><a href="#cb1-1054" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">fsdp_version</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1055"><a href="#cb1-1055" aria-hidden="true" tabindex="-1"></a><span class="co">  # Enable activation checkpointing to reduce memory usage during forward passes</span></span>
+<span id="cb1-1056"><a href="#cb1-1056" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">activation_checkpointing</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1057"><a href="#cb1-1057" aria-hidden="true" tabindex="-1"></a><span class="co">  # Offload parameters to CPU to reduce GPU memory usage</span></span>
+<span id="cb1-1058"><a href="#cb1-1058" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">offload_params</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1059"><a href="#cb1-1059" aria-hidden="true" tabindex="-1"></a><span class="co">  # Synchronize module states across all processes</span></span>
+<span id="cb1-1060"><a href="#cb1-1060" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">sync_module_states</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1061"><a href="#cb1-1061" aria-hidden="true" tabindex="-1"></a><span class="co">  # Enable CPU RAM efficient loading to reduce memory usage during model loading</span></span>
+<span id="cb1-1062"><a href="#cb1-1062" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">cpu_ram_efficient_loading</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1063"><a href="#cb1-1063" aria-hidden="true" tabindex="-1"></a><span class="co">  # Disabling this enables swap memory usage for resource-constrained setups when</span></span>
+<span id="cb1-1064"><a href="#cb1-1064" aria-hidden="true" tabindex="-1"></a><span class="co">  # offload_params is enabled.</span></span>
+<span id="cb1-1065"><a href="#cb1-1065" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">cpu_offload_pin_memory</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1066"><a href="#cb1-1066" aria-hidden="true" tabindex="-1"></a><span class="co">  # Use original parameters instead of flattened parameters</span></span>
+<span id="cb1-1067"><a href="#cb1-1067" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">use_orig_params</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1068"><a href="#cb1-1068" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1069"><a href="#cb1-1069" aria-hidden="true" tabindex="-1"></a><span class="co">  # Type of state dict to use for saving/loading checkpoints</span></span>
+<span id="cb1-1070"><a href="#cb1-1070" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">state_dict_type</span><span class="kw">:</span><span class="at"> Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None</span></span>
+<span id="cb1-1071"><a href="#cb1-1071" aria-hidden="true" tabindex="-1"></a><span class="co">  # Final state dict type to use after training completion</span></span>
+<span id="cb1-1072"><a href="#cb1-1072" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">final_state_dict_type</span><span class="kw">:</span><span class="at"> Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None</span></span>
+<span id="cb1-1073"><a href="#cb1-1073" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1074"><a href="#cb1-1074" aria-hidden="true" tabindex="-1"></a><span class="co">  # Policy for automatically wrapping modules with FSDP</span></span>
+<span id="cb1-1075"><a href="#cb1-1075" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">auto_wrap_policy</span><span class="kw">:</span><span class="at"> Literal['TRANSFORMER_BASED_WRAP', 'SIZE_BASED_WRAP'] | None</span></span>
+<span id="cb1-1076"><a href="#cb1-1076" aria-hidden="true" tabindex="-1"></a><span class="co">  # Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')</span></span>
+<span id="cb1-1077"><a href="#cb1-1077" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">transformer_layer_cls_to_wrap</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1078"><a href="#cb1-1078" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1079"><a href="#cb1-1079" aria-hidden="true" tabindex="-1"></a><span class="co">  # Reshard parameters after forward pass to save memory</span></span>
+<span id="cb1-1080"><a href="#cb1-1080" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">reshard_after_forward</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1081"><a href="#cb1-1081" aria-hidden="true" tabindex="-1"></a><span class="co">  # Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')</span></span>
+<span id="cb1-1082"><a href="#cb1-1082" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">mixed_precision_policy</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1083"><a href="#cb1-1083" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1084"><a href="#cb1-1084" aria-hidden="true" tabindex="-1"></a><span class="co"># FSDP version</span></span>
+<span id="cb1-1085"><a href="#cb1-1085" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_version</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1086"><a href="#cb1-1086" aria-hidden="true" tabindex="-1"></a><span class="fu">fsdp_final_state_dict_type</span><span class="kw">:</span><span class="at"> Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None</span></span>
+<span id="cb1-1087"><a href="#cb1-1087" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1088"><a href="#cb1-1088" aria-hidden="true" tabindex="-1"></a><span class="co"># How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for</span></span>
+<span id="cb1-1089"><a href="#cb1-1089" aria-hidden="true" tabindex="-1"></a><span class="co"># no eval.</span></span>
+<span id="cb1-1090"><a href="#cb1-1090" aria-hidden="true" tabindex="-1"></a><span class="fu">val_set_size</span><span class="kw">:</span><span class="at"> float | None = 0.0</span></span>
+<span id="cb1-1091"><a href="#cb1-1091" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1092"><a href="#cb1-1092" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of devices to shard across. If not set, will use all available devices.</span></span>
+<span id="cb1-1093"><a href="#cb1-1093" aria-hidden="true" tabindex="-1"></a><span class="fu">dp_shard_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1094"><a href="#cb1-1094" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of devices to replicate across.</span></span>
+<span id="cb1-1095"><a href="#cb1-1095" aria-hidden="true" tabindex="-1"></a><span class="fu">dp_replicate_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1096"><a href="#cb1-1096" aria-hidden="true" tabindex="-1"></a><span class="co"># Deprecated: use `context_parallel_size` instead</span></span>
+<span id="cb1-1097"><a href="#cb1-1097" aria-hidden="true" tabindex="-1"></a><span class="fu">sequence_parallel_degree</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1098"><a href="#cb1-1098" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to a divisor of the number of GPUs available to split sequences into chunks of</span></span>
+<span id="cb1-1099"><a href="#cb1-1099" aria-hidden="true" tabindex="-1"></a><span class="co"># equal size. Use in long context training to prevent OOM when sequences cannot fit into</span></span>
+<span id="cb1-1100"><a href="#cb1-1100" aria-hidden="true" tabindex="-1"></a><span class="co"># a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each</span></span>
+<span id="cb1-1101"><a href="#cb1-1101" aria-hidden="true" tabindex="-1"></a><span class="co"># sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized</span></span>
+<span id="cb1-1102"><a href="#cb1-1102" aria-hidden="true" tabindex="-1"></a><span class="co"># subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more</span></span>
+<span id="cb1-1103"><a href="#cb1-1103" aria-hidden="true" tabindex="-1"></a><span class="co"># details.</span></span>
+<span id="cb1-1104"><a href="#cb1-1104" aria-hidden="true" tabindex="-1"></a><span class="fu">context_parallel_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1105"><a href="#cb1-1105" aria-hidden="true" tabindex="-1"></a><span class="co"># Optional; strides across the key dimension. Larger values use more memory but should</span></span>
+<span id="cb1-1106"><a href="#cb1-1106" aria-hidden="true" tabindex="-1"></a><span class="co"># make training faster. Must evenly divide the number of KV heads in your model.</span></span>
+<span id="cb1-1107"><a href="#cb1-1107" aria-hidden="true" tabindex="-1"></a><span class="fu">heads_k_stride</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1108"><a href="#cb1-1108" aria-hidden="true" tabindex="-1"></a><span class="co"># One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to</span></span>
+<span id="cb1-1109"><a href="#cb1-1109" aria-hidden="true" tabindex="-1"></a><span class="co"># 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing</span></span>
+<span id="cb1-1110"><a href="#cb1-1110" aria-hidden="true" tabindex="-1"></a><span class="co"># case.</span></span>
+<span id="cb1-1111"><a href="#cb1-1111" aria-hidden="true" tabindex="-1"></a><span class="fu">ring_attn_func</span><span class="kw">:</span><span class="at"> RingAttnFunc | None</span></span>
+<span id="cb1-1112"><a href="#cb1-1112" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP.</span></span>
+<span id="cb1-1113"><a href="#cb1-1113" aria-hidden="true" tabindex="-1"></a><span class="fu">tensor_parallel_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1114"><a href="#cb1-1114" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1115"><a href="#cb1-1115" aria-hidden="true" tabindex="-1"></a><span class="co"># Add or change special tokens. If you add tokens here, you don't need to add them to</span></span>
+<span id="cb1-1116"><a href="#cb1-1116" aria-hidden="true" tabindex="-1"></a><span class="co"># the `tokens` list.</span></span>
+<span id="cb1-1117"><a href="#cb1-1117" aria-hidden="true" tabindex="-1"></a><span class="fu">special_tokens</span><span class="kw">:</span><span class="at"> SpecialTokensConfig | None</span></span>
+<span id="cb1-1118"><a href="#cb1-1118" aria-hidden="true" tabindex="-1"></a><span class="co">  # For SpecialTokensConfig:</span></span>
+<span id="cb1-1119"><a href="#cb1-1119" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">bos_token</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1120"><a href="#cb1-1120" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">eos_token</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1121"><a href="#cb1-1121" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">pad_token</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1122"><a href="#cb1-1122" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">unk_token</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1123"><a href="#cb1-1123" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">additional_special_tokens</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-1124"><a href="#cb1-1124" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1125"><a href="#cb1-1125" aria-hidden="true" tabindex="-1"></a><span class="co"># Add extra tokens to the tokenizer</span></span>
+<span id="cb1-1126"><a href="#cb1-1126" aria-hidden="true" tabindex="-1"></a><span class="fu">tokens</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-1127"><a href="#cb1-1127" aria-hidden="true" tabindex="-1"></a><span class="co"># Mapping token_id to new_token_string to override reserved added_tokens in the</span></span>
+<span id="cb1-1128"><a href="#cb1-1128" aria-hidden="true" tabindex="-1"></a><span class="co"># tokenizer. Only works for tokens that are not part of the base vocab (aka are</span></span>
+<span id="cb1-1129"><a href="#cb1-1129" aria-hidden="true" tabindex="-1"></a><span class="co"># added_tokens). Can be checked if they exist in tokenizer.json added_tokens.</span></span>
+<span id="cb1-1130"><a href="#cb1-1130" aria-hidden="true" tabindex="-1"></a><span class="fu">added_tokens_overrides</span><span class="kw">:</span><span class="at"> dict[int, str] | None</span></span>
 <span id="cb1-1131"><a href="#cb1-1131" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1132"><a href="#cb1-1132" aria-hidden="true" tabindex="-1"></a><span class="co"># Add extra tokens to the tokenizer</span></span>
-<span id="cb1-1133"><a href="#cb1-1133" aria-hidden="true" tabindex="-1"></a><span class="fu">tokens</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-1134"><a href="#cb1-1134" aria-hidden="true" tabindex="-1"></a><span class="co"># Mapping token_id to new_token_string to override reserved added_tokens in the</span></span>
-<span id="cb1-1135"><a href="#cb1-1135" aria-hidden="true" tabindex="-1"></a><span class="co"># tokenizer. Only works for tokens that are not part of the base vocab (aka are</span></span>
-<span id="cb1-1136"><a href="#cb1-1136" aria-hidden="true" tabindex="-1"></a><span class="co"># added_tokens). Can be checked if they exist in tokenizer.json added_tokens.</span></span>
-<span id="cb1-1137"><a href="#cb1-1137" aria-hidden="true" tabindex="-1"></a><span class="fu">added_tokens_overrides</span><span class="kw">:</span><span class="at"> dict[int, str] | None</span></span>
+<span id="cb1-1132"><a href="#cb1-1132" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use torch.compile and which backend to use. Setting to `auto` will enable</span></span>
+<span id="cb1-1133"><a href="#cb1-1133" aria-hidden="true" tabindex="-1"></a><span class="co"># torch compile when torch&gt;=2.6.0</span></span>
+<span id="cb1-1134"><a href="#cb1-1134" aria-hidden="true" tabindex="-1"></a><span class="fu">torch_compile</span><span class="kw">:</span><span class="at"> Literal['auto'] | bool | None</span></span>
+<span id="cb1-1135"><a href="#cb1-1135" aria-hidden="true" tabindex="-1"></a><span class="co"># Backend to use for torch.compile</span></span>
+<span id="cb1-1136"><a href="#cb1-1136" aria-hidden="true" tabindex="-1"></a><span class="fu">torch_compile_backend</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1137"><a href="#cb1-1137" aria-hidden="true" tabindex="-1"></a><span class="fu">torch_compile_mode</span><span class="kw">:</span><span class="at"> Literal['default', 'reduce-overhead', 'max-autotune'] | None</span></span>
 <span id="cb1-1138"><a href="#cb1-1138" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1139"><a href="#cb1-1139" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use torch.compile and which backend to use. setting to `auto` will enable</span></span>
-<span id="cb1-1140"><a href="#cb1-1140" aria-hidden="true" tabindex="-1"></a><span class="co"># torch compile when torch&gt;=2.6.0</span></span>
-<span id="cb1-1141"><a href="#cb1-1141" aria-hidden="true" tabindex="-1"></a><span class="fu">torch_compile</span><span class="kw">:</span><span class="at"> Literal['auto'] | bool | None</span></span>
-<span id="cb1-1142"><a href="#cb1-1142" aria-hidden="true" tabindex="-1"></a><span class="co"># Backend to use for torch.compile</span></span>
-<span id="cb1-1143"><a href="#cb1-1143" aria-hidden="true" tabindex="-1"></a><span class="fu">torch_compile_backend</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1144"><a href="#cb1-1144" aria-hidden="true" tabindex="-1"></a><span class="fu">torch_compile_mode</span><span class="kw">:</span><span class="at"> Literal['default', 'reduce-overhead', 'max-autotune'] | None</span></span>
-<span id="cb1-1145"><a href="#cb1-1145" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1146"><a href="#cb1-1146" aria-hidden="true" tabindex="-1"></a><span class="co"># Maximum number of iterations to train for. It precedes num_epochs which means that if</span></span>
-<span id="cb1-1147"><a href="#cb1-1147" aria-hidden="true" tabindex="-1"></a><span class="co"># both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps =&gt;</span></span>
-<span id="cb1-1148"><a href="#cb1-1148" aria-hidden="true" tabindex="-1"></a><span class="co"># `num_epochs: 2` and `max_steps: 100` will train for 100 steps</span></span>
-<span id="cb1-1149"><a href="#cb1-1149" aria-hidden="true" tabindex="-1"></a><span class="fu">max_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1150"><a href="#cb1-1150" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of warmup steps. Cannot use with warmup_ratio</span></span>
-<span id="cb1-1151"><a href="#cb1-1151" aria-hidden="true" tabindex="-1"></a><span class="fu">warmup_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1152"><a href="#cb1-1152" aria-hidden="true" tabindex="-1"></a><span class="co"># Warmup ratio. Cannot use with warmup_steps</span></span>
-<span id="cb1-1153"><a href="#cb1-1153" aria-hidden="true" tabindex="-1"></a><span class="fu">warmup_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1154"><a href="#cb1-1154" aria-hidden="true" tabindex="-1"></a><span class="co"># Leave empty to eval at each epoch, integer for every N steps. float for fraction of</span></span>
-<span id="cb1-1155"><a href="#cb1-1155" aria-hidden="true" tabindex="-1"></a><span class="co"># total steps</span></span>
-<span id="cb1-1156"><a href="#cb1-1156" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_steps</span><span class="kw">:</span><span class="at"> int | float | None</span></span>
-<span id="cb1-1157"><a href="#cb1-1157" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of times per epoch to run evals, mutually exclusive with eval_steps</span></span>
-<span id="cb1-1158"><a href="#cb1-1158" aria-hidden="true" tabindex="-1"></a><span class="fu">evals_per_epoch</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1159"><a href="#cb1-1159" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer</span></span>
-<span id="cb1-1160"><a href="#cb1-1160" aria-hidden="true" tabindex="-1"></a><span class="co"># from `eval_steps`</span></span>
-<span id="cb1-1161"><a href="#cb1-1161" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_strategy</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1162"><a href="#cb1-1162" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1163"><a href="#cb1-1163" aria-hidden="true" tabindex="-1"></a><span class="co"># Leave empty to save at each epoch, integer for every N steps. float for fraction of</span></span>
-<span id="cb1-1164"><a href="#cb1-1164" aria-hidden="true" tabindex="-1"></a><span class="co"># total steps</span></span>
-<span id="cb1-1165"><a href="#cb1-1165" aria-hidden="true" tabindex="-1"></a><span class="fu">save_steps</span><span class="kw">:</span><span class="at"> int | float | None</span></span>
-<span id="cb1-1166"><a href="#cb1-1166" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of times per epoch to save a checkpoint, mutually exclusive with save_steps</span></span>
-<span id="cb1-1167"><a href="#cb1-1167" aria-hidden="true" tabindex="-1"></a><span class="fu">saves_per_epoch</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1168"><a href="#cb1-1168" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better</span></span>
-<span id="cb1-1169"><a href="#cb1-1169" aria-hidden="true" tabindex="-1"></a><span class="co"># result is achieved, leave empty to infer from `save_steps`</span></span>
-<span id="cb1-1170"><a href="#cb1-1170" aria-hidden="true" tabindex="-1"></a><span class="fu">save_strategy</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1171"><a href="#cb1-1171" aria-hidden="true" tabindex="-1"></a><span class="co"># Checkpoints saved at a time</span></span>
-<span id="cb1-1172"><a href="#cb1-1172" aria-hidden="true" tabindex="-1"></a><span class="fu">save_total_limit</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1173"><a href="#cb1-1173" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to checkpoint a model after the first step of training. Defaults to False.</span></span>
-<span id="cb1-1174"><a href="#cb1-1174" aria-hidden="true" tabindex="-1"></a><span class="fu">save_first_step</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1175"><a href="#cb1-1175" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1176"><a href="#cb1-1176" aria-hidden="true" tabindex="-1"></a><span class="co"># Logging frequency</span></span>
-<span id="cb1-1177"><a href="#cb1-1177" aria-hidden="true" tabindex="-1"></a><span class="fu">logging_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1178"><a href="#cb1-1178" aria-hidden="true" tabindex="-1"></a><span class="co"># Stop training after this many evaluation losses have increased in a row. https://huggi</span></span>
-<span id="cb1-1179"><a href="#cb1-1179" aria-hidden="true" tabindex="-1"></a><span class="co"># ngface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppin</span></span>
-<span id="cb1-1180"><a href="#cb1-1180" aria-hidden="true" tabindex="-1"></a><span class="co"># gCallback</span></span>
-<span id="cb1-1181"><a href="#cb1-1181" aria-hidden="true" tabindex="-1"></a><span class="fu">early_stopping_patience</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1182"><a href="#cb1-1182" aria-hidden="true" tabindex="-1"></a><span class="fu">load_best_model_at_end</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-1183"><a href="#cb1-1183" aria-hidden="true" tabindex="-1"></a><span class="co"># Save only the model weights, skipping the optimizer. Using this means you can't resume</span></span>
-<span id="cb1-1184"><a href="#cb1-1184" aria-hidden="true" tabindex="-1"></a><span class="co"># from checkpoints.</span></span>
-<span id="cb1-1185"><a href="#cb1-1185" aria-hidden="true" tabindex="-1"></a><span class="fu">save_only_model</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-1186"><a href="#cb1-1186" aria-hidden="true" tabindex="-1"></a><span class="co"># Use tensorboard for logging</span></span>
-<span id="cb1-1187"><a href="#cb1-1187" aria-hidden="true" tabindex="-1"></a><span class="fu">use_tensorboard</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1188"><a href="#cb1-1188" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable the pytorch profiler to capture the first N steps of training to the</span></span>
-<span id="cb1-1189"><a href="#cb1-1189" aria-hidden="true" tabindex="-1"></a><span class="co"># output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more</span></span>
-<span id="cb1-1190"><a href="#cb1-1190" aria-hidden="true" tabindex="-1"></a><span class="co"># information. Snapshots can be visualized @ https://pytorch.org/memory_viz</span></span>
-<span id="cb1-1191"><a href="#cb1-1191" aria-hidden="true" tabindex="-1"></a><span class="fu">profiler_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1192"><a href="#cb1-1192" aria-hidden="true" tabindex="-1"></a><span class="co"># Which step to start the profiler at. Useful for only capturing a few steps mid-run.</span></span>
-<span id="cb1-1193"><a href="#cb1-1193" aria-hidden="true" tabindex="-1"></a><span class="fu">profiler_steps_start</span><span class="kw">:</span><span class="at"> int | None = 0</span></span>
-<span id="cb1-1194"><a href="#cb1-1194" aria-hidden="true" tabindex="-1"></a><span class="co"># bool of whether to report tokens per second at the end of training. This is not</span></span>
-<span id="cb1-1195"><a href="#cb1-1195" aria-hidden="true" tabindex="-1"></a><span class="co"># supported with pre-training datasets.</span></span>
-<span id="cb1-1196"><a href="#cb1-1196" aria-hidden="true" tabindex="-1"></a><span class="fu">include_tokens_per_second</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1197"><a href="#cb1-1197" aria-hidden="true" tabindex="-1"></a><span class="co"># bool of whether to report tokens per second per-gpu during training by measuring</span></span>
-<span id="cb1-1198"><a href="#cb1-1198" aria-hidden="true" tabindex="-1"></a><span class="co"># throughput of non-padding tokens.</span></span>
-<span id="cb1-1199"><a href="#cb1-1199" aria-hidden="true" tabindex="-1"></a><span class="fu">include_tkps</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
-<span id="cb1-1200"><a href="#cb1-1200" aria-hidden="true" tabindex="-1"></a><span class="co"># NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to</span></span>
-<span id="cb1-1201"><a href="#cb1-1201" aria-hidden="true" tabindex="-1"></a><span class="co"># add noise to embeddings. Currently only supported on Llama and Mistral</span></span>
-<span id="cb1-1202"><a href="#cb1-1202" aria-hidden="true" tabindex="-1"></a><span class="fu">neftune_noise_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1203"><a href="#cb1-1203" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1204"><a href="#cb1-1204" aria-hidden="true" tabindex="-1"></a><span class="co"># Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to</span></span>
-<span id="cb1-1205"><a href="#cb1-1205" aria-hidden="true" tabindex="-1"></a><span class="co"># `beta` in `ORPOConfig` due to trl mapping.</span></span>
-<span id="cb1-1206"><a href="#cb1-1206" aria-hidden="true" tabindex="-1"></a><span class="fu">orpo_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1207"><a href="#cb1-1207" aria-hidden="true" tabindex="-1"></a><span class="co"># Target reward margin for the SimPO loss</span></span>
-<span id="cb1-1208"><a href="#cb1-1208" aria-hidden="true" tabindex="-1"></a><span class="fu">simpo_gamma</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1209"><a href="#cb1-1209" aria-hidden="true" tabindex="-1"></a><span class="co"># Weight of the BC regularizer</span></span>
-<span id="cb1-1210"><a href="#cb1-1210" aria-hidden="true" tabindex="-1"></a><span class="fu">cpo_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1139"><a href="#cb1-1139" aria-hidden="true" tabindex="-1"></a><span class="co"># Maximum number of iterations to train for. Takes precedence over num_epochs: if both</span></span>
+<span id="cb1-1140"><a href="#cb1-1140" aria-hidden="true" tabindex="-1"></a><span class="co"># are set, num_epochs is not guaranteed. E.g., when 1 epoch is 1000 steps =&gt;</span></span>
+<span id="cb1-1141"><a href="#cb1-1141" aria-hidden="true" tabindex="-1"></a><span class="co"># `num_epochs: 2` and `max_steps: 100` will train for 100 steps</span></span>
+<span id="cb1-1142"><a href="#cb1-1142" aria-hidden="true" tabindex="-1"></a><span class="fu">max_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1143"><a href="#cb1-1143" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of warmup steps. Cannot use with warmup_ratio</span></span>
+<span id="cb1-1144"><a href="#cb1-1144" aria-hidden="true" tabindex="-1"></a><span class="fu">warmup_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1145"><a href="#cb1-1145" aria-hidden="true" tabindex="-1"></a><span class="co"># Warmup ratio. Cannot use with warmup_steps</span></span>
+<span id="cb1-1146"><a href="#cb1-1146" aria-hidden="true" tabindex="-1"></a><span class="fu">warmup_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1147"><a href="#cb1-1147" aria-hidden="true" tabindex="-1"></a><span class="co"># Leave empty to eval at each epoch, integer for every N steps. float for fraction of</span></span>
+<span id="cb1-1148"><a href="#cb1-1148" aria-hidden="true" tabindex="-1"></a><span class="co"># total steps</span></span>
+<span id="cb1-1149"><a href="#cb1-1149" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_steps</span><span class="kw">:</span><span class="at"> int | float | None</span></span>
+<span id="cb1-1150"><a href="#cb1-1150" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of times per epoch to run evals, mutually exclusive with eval_steps</span></span>
+<span id="cb1-1151"><a href="#cb1-1151" aria-hidden="true" tabindex="-1"></a><span class="fu">evals_per_epoch</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1152"><a href="#cb1-1152" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer</span></span>
+<span id="cb1-1153"><a href="#cb1-1153" aria-hidden="true" tabindex="-1"></a><span class="co"># from `eval_steps`</span></span>
+<span id="cb1-1154"><a href="#cb1-1154" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_strategy</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1155"><a href="#cb1-1155" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1156"><a href="#cb1-1156" aria-hidden="true" tabindex="-1"></a><span class="co"># Leave empty to save at each epoch, integer for every N steps. float for fraction of</span></span>
+<span id="cb1-1157"><a href="#cb1-1157" aria-hidden="true" tabindex="-1"></a><span class="co"># total steps</span></span>
+<span id="cb1-1158"><a href="#cb1-1158" aria-hidden="true" tabindex="-1"></a><span class="fu">save_steps</span><span class="kw">:</span><span class="at"> int | float | None</span></span>
+<span id="cb1-1159"><a href="#cb1-1159" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of times per epoch to save a checkpoint, mutually exclusive with save_steps</span></span>
+<span id="cb1-1160"><a href="#cb1-1160" aria-hidden="true" tabindex="-1"></a><span class="fu">saves_per_epoch</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1161"><a href="#cb1-1161" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better</span></span>
+<span id="cb1-1162"><a href="#cb1-1162" aria-hidden="true" tabindex="-1"></a><span class="co"># result is achieved, leave empty to infer from `save_steps`</span></span>
+<span id="cb1-1163"><a href="#cb1-1163" aria-hidden="true" tabindex="-1"></a><span class="fu">save_strategy</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1164"><a href="#cb1-1164" aria-hidden="true" tabindex="-1"></a><span class="co"># Maximum number of checkpoints to keep at a time</span></span>
+<span id="cb1-1165"><a href="#cb1-1165" aria-hidden="true" tabindex="-1"></a><span class="fu">save_total_limit</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1166"><a href="#cb1-1166" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to checkpoint a model after the first step of training. Defaults to False.</span></span>
+<span id="cb1-1167"><a href="#cb1-1167" aria-hidden="true" tabindex="-1"></a><span class="fu">save_first_step</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1168"><a href="#cb1-1168" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1169"><a href="#cb1-1169" aria-hidden="true" tabindex="-1"></a><span class="co"># Logging frequency</span></span>
+<span id="cb1-1170"><a href="#cb1-1170" aria-hidden="true" tabindex="-1"></a><span class="fu">logging_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1171"><a href="#cb1-1171" aria-hidden="true" tabindex="-1"></a><span class="co"># Stop training after this many evaluation losses have increased in a row.</span></span>
+<span id="cb1-1172"><a href="#cb1-1172" aria-hidden="true" tabindex="-1"></a><span class="co"># See the EarlyStoppingCallback source:</span></span>
+<span id="cb1-1173"><a href="#cb1-1173" aria-hidden="true" tabindex="-1"></a><span class="co"># https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback</span></span>
+<span id="cb1-1174"><a href="#cb1-1174" aria-hidden="true" tabindex="-1"></a><span class="fu">early_stopping_patience</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1175"><a href="#cb1-1175" aria-hidden="true" tabindex="-1"></a><span class="fu">load_best_model_at_end</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-1176"><a href="#cb1-1176" aria-hidden="true" tabindex="-1"></a><span class="co"># Save only the model weights, skipping the optimizer. Using this means you can't resume</span></span>
+<span id="cb1-1177"><a href="#cb1-1177" aria-hidden="true" tabindex="-1"></a><span class="co"># from checkpoints.</span></span>
+<span id="cb1-1178"><a href="#cb1-1178" aria-hidden="true" tabindex="-1"></a><span class="fu">save_only_model</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-1179"><a href="#cb1-1179" aria-hidden="true" tabindex="-1"></a><span class="co"># Use tensorboard for logging</span></span>
+<span id="cb1-1180"><a href="#cb1-1180" aria-hidden="true" tabindex="-1"></a><span class="fu">use_tensorboard</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1181"><a href="#cb1-1181" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable the pytorch profiler to capture the first N steps of training to the</span></span>
+<span id="cb1-1182"><a href="#cb1-1182" aria-hidden="true" tabindex="-1"></a><span class="co"># output_dir. See https://pytorch.org/blog/understanding-gpu-memory-1/ for more</span></span>
+<span id="cb1-1183"><a href="#cb1-1183" aria-hidden="true" tabindex="-1"></a><span class="co"># information. Snapshots can be visualized @ https://pytorch.org/memory_viz</span></span>
+<span id="cb1-1184"><a href="#cb1-1184" aria-hidden="true" tabindex="-1"></a><span class="fu">profiler_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1185"><a href="#cb1-1185" aria-hidden="true" tabindex="-1"></a><span class="co"># Which step to start the profiler at. Useful for only capturing a few steps mid-run.</span></span>
+<span id="cb1-1186"><a href="#cb1-1186" aria-hidden="true" tabindex="-1"></a><span class="fu">profiler_steps_start</span><span class="kw">:</span><span class="at"> int | None = 0</span></span>
+<span id="cb1-1187"><a href="#cb1-1187" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to report tokens per second at the end of training. This is not</span></span>
+<span id="cb1-1188"><a href="#cb1-1188" aria-hidden="true" tabindex="-1"></a><span class="co"># supported with pre-training datasets.</span></span>
+<span id="cb1-1189"><a href="#cb1-1189" aria-hidden="true" tabindex="-1"></a><span class="fu">include_tokens_per_second</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1190"><a href="#cb1-1190" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to report tokens per second per-gpu during training by measuring</span></span>
+<span id="cb1-1191"><a href="#cb1-1191" aria-hidden="true" tabindex="-1"></a><span class="co"># throughput of non-padding tokens.</span></span>
+<span id="cb1-1192"><a href="#cb1-1192" aria-hidden="true" tabindex="-1"></a><span class="fu">include_tkps</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
+<span id="cb1-1193"><a href="#cb1-1193" aria-hidden="true" tabindex="-1"></a><span class="co"># NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to</span></span>
+<span id="cb1-1194"><a href="#cb1-1194" aria-hidden="true" tabindex="-1"></a><span class="co"># add noise to embeddings. Currently only supported on Llama and Mistral</span></span>
+<span id="cb1-1195"><a href="#cb1-1195" aria-hidden="true" tabindex="-1"></a><span class="fu">neftune_noise_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1196"><a href="#cb1-1196" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1197"><a href="#cb1-1197" aria-hidden="true" tabindex="-1"></a><span class="co"># Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to</span></span>
+<span id="cb1-1198"><a href="#cb1-1198" aria-hidden="true" tabindex="-1"></a><span class="co"># `beta` in `ORPOConfig` due to trl mapping.</span></span>
+<span id="cb1-1199"><a href="#cb1-1199" aria-hidden="true" tabindex="-1"></a><span class="fu">orpo_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1200"><a href="#cb1-1200" aria-hidden="true" tabindex="-1"></a><span class="co"># Target reward margin for the SimPO loss</span></span>
+<span id="cb1-1201"><a href="#cb1-1201" aria-hidden="true" tabindex="-1"></a><span class="fu">simpo_gamma</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1202"><a href="#cb1-1202" aria-hidden="true" tabindex="-1"></a><span class="co"># Weight of the BC regularizer</span></span>
+<span id="cb1-1203"><a href="#cb1-1203" aria-hidden="true" tabindex="-1"></a><span class="fu">cpo_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1204"><a href="#cb1-1204" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1205"><a href="#cb1-1205" aria-hidden="true" tabindex="-1"></a><span class="co"># Factor for desirable loss term in KTO loss</span></span>
+<span id="cb1-1206"><a href="#cb1-1206" aria-hidden="true" tabindex="-1"></a><span class="fu">kto_desirable_weight</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1207"><a href="#cb1-1207" aria-hidden="true" tabindex="-1"></a><span class="co"># Factor for undesirable loss term in KTO loss</span></span>
+<span id="cb1-1208"><a href="#cb1-1208" aria-hidden="true" tabindex="-1"></a><span class="fu">kto_undesirable_weight</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1209"><a href="#cb1-1209" aria-hidden="true" tabindex="-1"></a><span class="co"># The beta parameter for the RL training</span></span>
+<span id="cb1-1210"><a href="#cb1-1210" aria-hidden="true" tabindex="-1"></a><span class="fu">rl_beta</span><span class="kw">:</span><span class="at"> float | None</span></span>
 <span id="cb1-1211"><a href="#cb1-1211" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1212"><a href="#cb1-1212" aria-hidden="true" tabindex="-1"></a><span class="co"># Factor for desirable loss term in KTO loss</span></span>
-<span id="cb1-1213"><a href="#cb1-1213" aria-hidden="true" tabindex="-1"></a><span class="fu">kto_desirable_weight</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1214"><a href="#cb1-1214" aria-hidden="true" tabindex="-1"></a><span class="co"># Factor for undesirable loss term in KTO loss</span></span>
-<span id="cb1-1215"><a href="#cb1-1215" aria-hidden="true" tabindex="-1"></a><span class="fu">kto_undesirable_weight</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1216"><a href="#cb1-1216" aria-hidden="true" tabindex="-1"></a><span class="co"># The beta parameter for the RL training</span></span>
-<span id="cb1-1217"><a href="#cb1-1217" aria-hidden="true" tabindex="-1"></a><span class="fu">rl_beta</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1218"><a href="#cb1-1218" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1219"><a href="#cb1-1219" aria-hidden="true" tabindex="-1"></a><span class="co"># Defines the max memory usage per gpu on the system. Passed through to transformers</span></span>
-<span id="cb1-1220"><a href="#cb1-1220" aria-hidden="true" tabindex="-1"></a><span class="co"># when loading the model.</span></span>
-<span id="cb1-1221"><a href="#cb1-1221" aria-hidden="true" tabindex="-1"></a><span class="fu">max_memory</span><span class="kw">:</span><span class="at"> dict[int | Literal['cpu', 'disk'], int | str] | None</span></span>
-<span id="cb1-1222"><a href="#cb1-1222" aria-hidden="true" tabindex="-1"></a><span class="co"># Limit the memory for all available GPUs to this amount (if an integer, expressed in</span></span>
-<span id="cb1-1223"><a href="#cb1-1223" aria-hidden="true" tabindex="-1"></a><span class="co"># gigabytes); default: unset</span></span>
-<span id="cb1-1224"><a href="#cb1-1224" aria-hidden="true" tabindex="-1"></a><span class="fu">gpu_memory_limit</span><span class="kw">:</span><span class="at"> int | str | None</span></span>
-<span id="cb1-1225"><a href="#cb1-1225" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use low_cpu_mem_usage</span></span>
-<span id="cb1-1226"><a href="#cb1-1226" aria-hidden="true" tabindex="-1"></a><span class="fu">low_cpu_mem_usage</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1227"><a href="#cb1-1227" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1228"><a href="#cb1-1228" aria-hidden="true" tabindex="-1"></a><span class="co"># The name of the chat template to use for training, following values are supported:</span></span>
-<span id="cb1-1229"><a href="#cb1-1229" aria-hidden="true" tabindex="-1"></a><span class="co"># tokenizer_default: Uses the chat template that is available in the</span></span>
-<span id="cb1-1230"><a href="#cb1-1230" aria-hidden="true" tabindex="-1"></a><span class="co"># tokenizer_config.json. If the chat template is not available in the tokenizer, it will</span></span>
-<span id="cb1-1231"><a href="#cb1-1231" aria-hidden="true" tabindex="-1"></a><span class="co"># raise an error. This is the default value.</span></span>
-<span id="cb1-1232"><a href="#cb1-1232" aria-hidden="true" tabindex="-1"></a><span class="co"># alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates</span></span>
-<span id="cb1-1233"><a href="#cb1-1233" aria-hidden="true" tabindex="-1"></a><span class="co"># are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.</span></span>
-<span id="cb1-1234"><a href="#cb1-1234" aria-hidden="true" tabindex="-1"></a><span class="co"># tokenizer_default_fallback_*: where * is the name of the chat template to fallback to.</span></span>
-<span id="cb1-1235"><a href="#cb1-1235" aria-hidden="true" tabindex="-1"></a><span class="co"># E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not</span></span>
-<span id="cb1-1236"><a href="#cb1-1236" aria-hidden="true" tabindex="-1"></a><span class="co"># available in the tokenizer. jinja: Uses a custom jinja template for the chat template.</span></span>
-<span id="cb1-1237"><a href="#cb1-1237" aria-hidden="true" tabindex="-1"></a><span class="co"># The custom jinja template should be provided in the chat_template_jinja field. The</span></span>
-<span id="cb1-1238"><a href="#cb1-1238" aria-hidden="true" tabindex="-1"></a><span class="co"># selected chat template will be saved to the tokenizer_config.json for easier</span></span>
-<span id="cb1-1239"><a href="#cb1-1239" aria-hidden="true" tabindex="-1"></a><span class="co"># inferencing</span></span>
-<span id="cb1-1240"><a href="#cb1-1240" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template</span><span class="kw">:</span><span class="at"> ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None</span></span>
-<span id="cb1-1241"><a href="#cb1-1241" aria-hidden="true" tabindex="-1"></a><span class="co"># Custom jinja template or path to jinja file for chat template. This will be only used</span></span>
-<span id="cb1-1242"><a href="#cb1-1242" aria-hidden="true" tabindex="-1"></a><span class="co"># if chat_template is set to `jinja` or `null` (in which case chat_template is</span></span>
-<span id="cb1-1243"><a href="#cb1-1243" aria-hidden="true" tabindex="-1"></a><span class="co"># automatically set to `jinja`). Default is null.</span></span>
-<span id="cb1-1244"><a href="#cb1-1244" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template_jinja</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1245"><a href="#cb1-1245" aria-hidden="true" tabindex="-1"></a><span class="co"># Additional kwargs to pass to the chat template. This is useful for customizing the</span></span>
-<span id="cb1-1246"><a href="#cb1-1246" aria-hidden="true" tabindex="-1"></a><span class="co"># chat template. For example, you can pass `thinking=False` to add a generation prompt</span></span>
-<span id="cb1-1247"><a href="#cb1-1247" aria-hidden="true" tabindex="-1"></a><span class="co"># to the chat template.</span></span>
-<span id="cb1-1248"><a href="#cb1-1248" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
-<span id="cb1-1249"><a href="#cb1-1249" aria-hidden="true" tabindex="-1"></a><span class="co"># Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the</span></span>
-<span id="cb1-1250"><a href="#cb1-1250" aria-hidden="true" tabindex="-1"></a><span class="co"># boundaries between conversation turns. For example: ['/INST', '&lt;/s&gt;',</span></span>
-<span id="cb1-1251"><a href="#cb1-1251" aria-hidden="true" tabindex="-1"></a><span class="co"># '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is</span></span>
-<span id="cb1-1252"><a href="#cb1-1252" aria-hidden="true" tabindex="-1"></a><span class="co"># useful for templates that use multiple delimiter tokens.</span></span>
-<span id="cb1-1253"><a href="#cb1-1253" aria-hidden="true" tabindex="-1"></a><span class="fu">eot_tokens</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-1254"><a href="#cb1-1254" aria-hidden="true" tabindex="-1"></a><span class="co"># Changes the default system message. Currently only supports chatml.</span></span>
-<span id="cb1-1255"><a href="#cb1-1255" aria-hidden="true" tabindex="-1"></a><span class="fu">default_system_message</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1212"><a href="#cb1-1212" aria-hidden="true" tabindex="-1"></a><span class="co"># Defines the max memory usage per gpu on the system. Passed through to transformers</span></span>
+<span id="cb1-1213"><a href="#cb1-1213" aria-hidden="true" tabindex="-1"></a><span class="co"># when loading the model.</span></span>
+<span id="cb1-1214"><a href="#cb1-1214" aria-hidden="true" tabindex="-1"></a><span class="fu">max_memory</span><span class="kw">:</span><span class="at"> dict[int | Literal['cpu', 'disk'], int | str] | None</span></span>
+<span id="cb1-1215"><a href="#cb1-1215" aria-hidden="true" tabindex="-1"></a><span class="co"># Limit the memory for all available GPUs to this amount (if an integer, expressed in</span></span>
+<span id="cb1-1216"><a href="#cb1-1216" aria-hidden="true" tabindex="-1"></a><span class="co"># gigabytes); default: unset</span></span>
+<span id="cb1-1217"><a href="#cb1-1217" aria-hidden="true" tabindex="-1"></a><span class="fu">gpu_memory_limit</span><span class="kw">:</span><span class="at"> int | str | None</span></span>
+<span id="cb1-1218"><a href="#cb1-1218" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use low_cpu_mem_usage</span></span>
+<span id="cb1-1219"><a href="#cb1-1219" aria-hidden="true" tabindex="-1"></a><span class="fu">low_cpu_mem_usage</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1220"><a href="#cb1-1220" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1221"><a href="#cb1-1221" aria-hidden="true" tabindex="-1"></a><span class="co"># The name of the chat template to use for training. The following values are supported:</span></span>
+<span id="cb1-1222"><a href="#cb1-1222" aria-hidden="true" tabindex="-1"></a><span class="co"># tokenizer_default: Uses the chat template that is available in the</span></span>
+<span id="cb1-1223"><a href="#cb1-1223" aria-hidden="true" tabindex="-1"></a><span class="co"># tokenizer_config.json. If the chat template is not available in the tokenizer, it will</span></span>
+<span id="cb1-1224"><a href="#cb1-1224" aria-hidden="true" tabindex="-1"></a><span class="co"># raise an error. This is the default value.</span></span>
+<span id="cb1-1225"><a href="#cb1-1225" aria-hidden="true" tabindex="-1"></a><span class="co"># alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates</span></span>
+<span id="cb1-1226"><a href="#cb1-1226" aria-hidden="true" tabindex="-1"></a><span class="co"># are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.</span></span>
+<span id="cb1-1227"><a href="#cb1-1227" aria-hidden="true" tabindex="-1"></a><span class="co"># tokenizer_default_fallback_*: where * is the name of the chat template to fall back to.</span></span>
+<span id="cb1-1228"><a href="#cb1-1228" aria-hidden="true" tabindex="-1"></a><span class="co"># E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not</span></span>
+<span id="cb1-1229"><a href="#cb1-1229" aria-hidden="true" tabindex="-1"></a><span class="co"># available in the tokenizer. jinja: Uses a custom jinja template for the chat template.</span></span>
+<span id="cb1-1230"><a href="#cb1-1230" aria-hidden="true" tabindex="-1"></a><span class="co"># The custom jinja template should be provided in the chat_template_jinja field. The</span></span>
+<span id="cb1-1231"><a href="#cb1-1231" aria-hidden="true" tabindex="-1"></a><span class="co"># selected chat template will be saved to the tokenizer_config.json for easier</span></span>
+<span id="cb1-1232"><a href="#cb1-1232" aria-hidden="true" tabindex="-1"></a><span class="co"># inferencing</span></span>
+<span id="cb1-1233"><a href="#cb1-1233" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template</span><span class="kw">:</span><span class="at"> ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None</span></span>
+<span id="cb1-1234"><a href="#cb1-1234" aria-hidden="true" tabindex="-1"></a><span class="co"># Custom jinja template or path to jinja file for chat template. This will only be used</span></span>
+<span id="cb1-1235"><a href="#cb1-1235" aria-hidden="true" tabindex="-1"></a><span class="co"># if chat_template is set to `jinja` or `null` (in which case chat_template is</span></span>
+<span id="cb1-1236"><a href="#cb1-1236" aria-hidden="true" tabindex="-1"></a><span class="co"># automatically set to `jinja`). Default is null.</span></span>
+<span id="cb1-1237"><a href="#cb1-1237" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template_jinja</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1238"><a href="#cb1-1238" aria-hidden="true" tabindex="-1"></a><span class="co"># Additional kwargs to pass to the chat template. This is useful for customizing the</span></span>
+<span id="cb1-1239"><a href="#cb1-1239" aria-hidden="true" tabindex="-1"></a><span class="co"># chat template. For example, you can pass `thinking=False` to add a generation prompt</span></span>
+<span id="cb1-1240"><a href="#cb1-1240" aria-hidden="true" tabindex="-1"></a><span class="co"># to the chat template.</span></span>
+<span id="cb1-1241"><a href="#cb1-1241" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
+<span id="cb1-1242"><a href="#cb1-1242" aria-hidden="true" tabindex="-1"></a><span class="co"># Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the</span></span>
+<span id="cb1-1243"><a href="#cb1-1243" aria-hidden="true" tabindex="-1"></a><span class="co"># boundaries between conversation turns. For example: ['/INST', '&lt;/s&gt;',</span></span>
+<span id="cb1-1244"><a href="#cb1-1244" aria-hidden="true" tabindex="-1"></a><span class="co"># '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is</span></span>
+<span id="cb1-1245"><a href="#cb1-1245" aria-hidden="true" tabindex="-1"></a><span class="co"># useful for templates that use multiple delimiter tokens.</span></span>
+<span id="cb1-1246"><a href="#cb1-1246" aria-hidden="true" tabindex="-1"></a><span class="fu">eot_tokens</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-1247"><a href="#cb1-1247" aria-hidden="true" tabindex="-1"></a><span class="co"># Changes the default system message. Currently only supports chatml.</span></span>
+<span id="cb1-1248"><a href="#cb1-1248" aria-hidden="true" tabindex="-1"></a><span class="fu">default_system_message</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1249"><a href="#cb1-1249" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1250"><a href="#cb1-1250" aria-hidden="true" tabindex="-1"></a><span class="co"># Token index or indices to adjust embedding weights to the mean of the other tokens.</span></span>
+<span id="cb1-1251"><a href="#cb1-1251" aria-hidden="true" tabindex="-1"></a><span class="co"># This is useful when the model has untrained embeddings.</span></span>
+<span id="cb1-1252"><a href="#cb1-1252" aria-hidden="true" tabindex="-1"></a><span class="fu">fix_untrained_tokens</span><span class="kw">:</span><span class="at"> int | list[int] | None</span></span>
+<span id="cb1-1253"><a href="#cb1-1253" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1254"><a href="#cb1-1254" aria-hidden="true" tabindex="-1"></a><span class="fu">is_preprocess</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1255"><a href="#cb1-1255" aria-hidden="true" tabindex="-1"></a><span class="fu">preprocess_iterable</span><span class="kw">:</span><span class="at"> bool | None</span></span>
 <span id="cb1-1256"><a href="#cb1-1256" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1257"><a href="#cb1-1257" aria-hidden="true" tabindex="-1"></a><span class="co"># Token index or indices to adjust embedding weights to the mean of the other tokens.</span></span>
-<span id="cb1-1258"><a href="#cb1-1258" aria-hidden="true" tabindex="-1"></a><span class="co"># This is useful when the model has untrained embeddings.</span></span>
-<span id="cb1-1259"><a href="#cb1-1259" aria-hidden="true" tabindex="-1"></a><span class="fu">fix_untrained_tokens</span><span class="kw">:</span><span class="at"> int | list[int] | None</span></span>
-<span id="cb1-1260"><a href="#cb1-1260" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1261"><a href="#cb1-1261" aria-hidden="true" tabindex="-1"></a><span class="fu">is_preprocess</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1262"><a href="#cb1-1262" aria-hidden="true" tabindex="-1"></a><span class="fu">preprocess_iterable</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1263"><a href="#cb1-1263" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1264"><a href="#cb1-1264" aria-hidden="true" tabindex="-1"></a><span class="co"># Total number of tokens - internal use</span></span>
-<span id="cb1-1265"><a href="#cb1-1265" aria-hidden="true" tabindex="-1"></a><span class="fu">total_num_tokens</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1266"><a href="#cb1-1266" aria-hidden="true" tabindex="-1"></a><span class="fu">total_supervised_tokens</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1267"><a href="#cb1-1267" aria-hidden="true" tabindex="-1"></a><span class="co"># You can set these packing optimizations AFTER starting a training at least once. The</span></span>
-<span id="cb1-1268"><a href="#cb1-1268" aria-hidden="true" tabindex="-1"></a><span class="co"># trainer will provide recommended values for these values.</span></span>
-<span id="cb1-1269"><a href="#cb1-1269" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing_eff_est</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1270"><a href="#cb1-1270" aria-hidden="true" tabindex="-1"></a><span class="fu">axolotl_config_path</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1271"><a href="#cb1-1271" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1257"><a href="#cb1-1257" aria-hidden="true" tabindex="-1"></a><span class="co"># Total number of tokens - internal use</span></span>
+<span id="cb1-1258"><a href="#cb1-1258" aria-hidden="true" tabindex="-1"></a><span class="fu">total_num_tokens</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1259"><a href="#cb1-1259" aria-hidden="true" tabindex="-1"></a><span class="fu">total_supervised_tokens</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1260"><a href="#cb1-1260" aria-hidden="true" tabindex="-1"></a><span class="co"># You can set these packing optimizations AFTER starting a training at least once. The</span></span>
+<span id="cb1-1261"><a href="#cb1-1261" aria-hidden="true" tabindex="-1"></a><span class="co"># trainer will provide recommended values for these settings.</span></span>
+<span id="cb1-1262"><a href="#cb1-1262" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing_eff_est</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1263"><a href="#cb1-1263" aria-hidden="true" tabindex="-1"></a><span class="fu">axolotl_config_path</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1264"><a href="#cb1-1264" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1265"><a href="#cb1-1265" aria-hidden="true" tabindex="-1"></a><span class="co"># Internal use only - Used to identify which the model is based on</span></span>
+<span id="cb1-1266"><a href="#cb1-1266" aria-hidden="true" tabindex="-1"></a><span class="fu">is_falcon_derived_model</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1267"><a href="#cb1-1267" aria-hidden="true" tabindex="-1"></a><span class="co"># Internal use only - Used to identify which the model is based on</span></span>
+<span id="cb1-1268"><a href="#cb1-1268" aria-hidden="true" tabindex="-1"></a><span class="fu">is_llama_derived_model</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1269"><a href="#cb1-1269" aria-hidden="true" tabindex="-1"></a><span class="co"># Internal use only - Used to identify which the model is based on. Please note that if</span></span>
+<span id="cb1-1270"><a href="#cb1-1270" aria-hidden="true" tabindex="-1"></a><span class="co"># you set this to true, `padding_side` will be set to 'left' by default</span></span>
+<span id="cb1-1271"><a href="#cb1-1271" aria-hidden="true" tabindex="-1"></a><span class="fu">is_mistral_derived_model</span><span class="kw">:</span><span class="at"> bool | None</span></span>
 <span id="cb1-1272"><a href="#cb1-1272" aria-hidden="true" tabindex="-1"></a><span class="co"># Internal use only - Used to identify which the model is based on</span></span>
-<span id="cb1-1273"><a href="#cb1-1273" aria-hidden="true" tabindex="-1"></a><span class="fu">is_falcon_derived_model</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1274"><a href="#cb1-1274" aria-hidden="true" tabindex="-1"></a><span class="co"># Internal use only - Used to identify which the model is based on</span></span>
-<span id="cb1-1275"><a href="#cb1-1275" aria-hidden="true" tabindex="-1"></a><span class="fu">is_llama_derived_model</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1276"><a href="#cb1-1276" aria-hidden="true" tabindex="-1"></a><span class="co"># Internal use only - Used to identify which the model is based on. Please note that if</span></span>
-<span id="cb1-1277"><a href="#cb1-1277" aria-hidden="true" tabindex="-1"></a><span class="co"># you set this to true, `padding_side` will be set to 'left' by default</span></span>
-<span id="cb1-1278"><a href="#cb1-1278" aria-hidden="true" tabindex="-1"></a><span class="fu">is_mistral_derived_model</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1279"><a href="#cb1-1279" aria-hidden="true" tabindex="-1"></a><span class="co"># Internal use only - Used to identify which the model is based on</span></span>
-<span id="cb1-1280"><a href="#cb1-1280" aria-hidden="true" tabindex="-1"></a><span class="fu">is_qwen_derived_model</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1281"><a href="#cb1-1281" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1282"><a href="#cb1-1282" aria-hidden="true" tabindex="-1"></a><span class="co"># Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available</span></span>
-<span id="cb1-1283"><a href="#cb1-1283" aria-hidden="true" tabindex="-1"></a><span class="co"># plugins or doc below for more details.</span></span>
-<span id="cb1-1284"><a href="#cb1-1284" aria-hidden="true" tabindex="-1"></a><span class="co"># https://docs.axolotl.ai/docs/custom_integrations.html</span></span>
-<span id="cb1-1285"><a href="#cb1-1285" aria-hidden="true" tabindex="-1"></a><span class="fu">plugins</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-1286"><a href="#cb1-1286" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable sample generation during training for monitoring</span></span>
-<span id="cb1-1287"><a href="#cb1-1287" aria-hidden="true" tabindex="-1"></a><span class="fu">generate_samples</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-1288"><a href="#cb1-1288" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of samples to generate at each interval</span></span>
-<span id="cb1-1289"><a href="#cb1-1289" aria-hidden="true" tabindex="-1"></a><span class="fu">num_generation_samples</span><span class="kw">:</span><span class="at"> int | None = 3</span></span>
-<span id="cb1-1290"><a href="#cb1-1290" aria-hidden="true" tabindex="-1"></a><span class="co"># Maximum new tokens to generate per sample</span></span>
-<span id="cb1-1291"><a href="#cb1-1291" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_max_new_tokens</span><span class="kw">:</span><span class="at"> int | None = 50</span></span>
-<span id="cb1-1292"><a href="#cb1-1292" aria-hidden="true" tabindex="-1"></a><span class="co"># Temperature for sample generation (0.0 = greedy)</span></span>
-<span id="cb1-1293"><a href="#cb1-1293" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_temperature</span><span class="kw">:</span><span class="at"> float | None = 0.7</span></span>
-<span id="cb1-1294"><a href="#cb1-1294" aria-hidden="true" tabindex="-1"></a><span class="co"># Nucleus sampling parameter for generation</span></span>
-<span id="cb1-1295"><a href="#cb1-1295" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_top_p</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1296"><a href="#cb1-1296" aria-hidden="true" tabindex="-1"></a><span class="co"># Top-k sampling parameter for generation</span></span>
-<span id="cb1-1297"><a href="#cb1-1297" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_top_k</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1298"><a href="#cb1-1298" aria-hidden="true" tabindex="-1"></a><span class="co"># Ratio of input to use as prompt (0.0-1.0)</span></span>
-<span id="cb1-1299"><a href="#cb1-1299" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_prompt_ratio</span><span class="kw">:</span><span class="at"> float | None = 0.5</span></span>
-<span id="cb1-1300"><a href="#cb1-1300" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use sampling (vs greedy decoding)</span></span>
-<span id="cb1-1301"><a href="#cb1-1301" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_do_sample</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
-<span id="cb1-1302"><a href="#cb1-1302" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1303"><a href="#cb1-1303" aria-hidden="true" tabindex="-1"></a><span class="co"># This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This</span></span>
-<span id="cb1-1304"><a href="#cb1-1304" aria-hidden="true" tabindex="-1"></a><span class="co"># can also be a relative path to a model on disk</span></span>
-<span id="cb1-1305"><a href="#cb1-1305" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model</span><span class="kw">:</span><span class="at"> str (required)</span></span>
-<span id="cb1-1306"><a href="#cb1-1306" aria-hidden="true" tabindex="-1"></a><span class="co"># If the base_model repo on hf hub doesn't include configuration .json files, You can</span></span>
-<span id="cb1-1307"><a href="#cb1-1307" aria-hidden="true" tabindex="-1"></a><span class="co"># set that here, or leave this empty to default to base_model</span></span>
-<span id="cb1-1308"><a href="#cb1-1308" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model_config</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1309"><a href="#cb1-1309" aria-hidden="true" tabindex="-1"></a><span class="co"># transformers config class (e.g., 'LlamaConfig', 'MistralConfig'). Defaults to</span></span>
-<span id="cb1-1310"><a href="#cb1-1310" aria-hidden="true" tabindex="-1"></a><span class="co"># AutoConfig.</span></span>
-<span id="cb1-1311"><a href="#cb1-1311" aria-hidden="true" tabindex="-1"></a><span class="fu">cls_model_config</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1312"><a href="#cb1-1312" aria-hidden="true" tabindex="-1"></a><span class="co"># Optional tokenizer configuration path in case you want to use a different tokenizer</span></span>
-<span id="cb1-1313"><a href="#cb1-1313" aria-hidden="true" tabindex="-1"></a><span class="co"># than the one defined in the base model</span></span>
-<span id="cb1-1314"><a href="#cb1-1314" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_config</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1315"><a href="#cb1-1315" aria-hidden="true" tabindex="-1"></a><span class="co"># use_fast option for tokenizer loading from_pretrained, default to True</span></span>
-<span id="cb1-1316"><a href="#cb1-1316" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_use_fast</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1317"><a href="#cb1-1317" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use the legacy tokenizer setting, defaults to True</span></span>
-<span id="cb1-1318"><a href="#cb1-1318" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_legacy</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1319"><a href="#cb1-1319" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use mistral-common tokenizer. If set to True, it will use the mistral-</span></span>
-<span id="cb1-1320"><a href="#cb1-1320" aria-hidden="true" tabindex="-1"></a><span class="co"># common tokenizer.</span></span>
-<span id="cb1-1321"><a href="#cb1-1321" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_use_mistral_common</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1322"><a href="#cb1-1322" aria-hidden="true" tabindex="-1"></a><span class="co"># Corresponding tokenizer for the model AutoTokenizer is a good choice</span></span>
-<span id="cb1-1323"><a href="#cb1-1323" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_type</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1324"><a href="#cb1-1324" aria-hidden="true" tabindex="-1"></a><span class="co"># transformers processor class</span></span>
-<span id="cb1-1325"><a href="#cb1-1325" aria-hidden="true" tabindex="-1"></a><span class="fu">processor_type</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1326"><a href="#cb1-1326" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to save jinja files for tokenizer, transformers default is True</span></span>
-<span id="cb1-1327"><a href="#cb1-1327" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_save_jinja_files</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
-<span id="cb1-1328"><a href="#cb1-1328" aria-hidden="true" tabindex="-1"></a><span class="co"># Trust remote code for untrusted source</span></span>
-<span id="cb1-1329"><a href="#cb1-1329" aria-hidden="true" tabindex="-1"></a><span class="fu">trust_remote_code</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1273"><a href="#cb1-1273" aria-hidden="true" tabindex="-1"></a><span class="fu">is_qwen_derived_model</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1274"><a href="#cb1-1274" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1275"><a href="#cb1-1275" aria-hidden="true" tabindex="-1"></a><span class="co"># Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available</span></span>
+<span id="cb1-1276"><a href="#cb1-1276" aria-hidden="true" tabindex="-1"></a><span class="co"># plugins or doc below for more details.</span></span>
+<span id="cb1-1277"><a href="#cb1-1277" aria-hidden="true" tabindex="-1"></a><span class="co"># https://docs.axolotl.ai/docs/custom_integrations.html</span></span>
+<span id="cb1-1278"><a href="#cb1-1278" aria-hidden="true" tabindex="-1"></a><span class="fu">plugins</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-1279"><a href="#cb1-1279" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable sample generation during training for monitoring</span></span>
+<span id="cb1-1280"><a href="#cb1-1280" aria-hidden="true" tabindex="-1"></a><span class="fu">generate_samples</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-1281"><a href="#cb1-1281" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of samples to generate at each interval</span></span>
+<span id="cb1-1282"><a href="#cb1-1282" aria-hidden="true" tabindex="-1"></a><span class="fu">num_generation_samples</span><span class="kw">:</span><span class="at"> int | None = 3</span></span>
+<span id="cb1-1283"><a href="#cb1-1283" aria-hidden="true" tabindex="-1"></a><span class="co"># Maximum new tokens to generate per sample</span></span>
+<span id="cb1-1284"><a href="#cb1-1284" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_max_new_tokens</span><span class="kw">:</span><span class="at"> int | None = 50</span></span>
+<span id="cb1-1285"><a href="#cb1-1285" aria-hidden="true" tabindex="-1"></a><span class="co"># Temperature for sample generation (0.0 = greedy)</span></span>
+<span id="cb1-1286"><a href="#cb1-1286" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_temperature</span><span class="kw">:</span><span class="at"> float | None = 0.7</span></span>
+<span id="cb1-1287"><a href="#cb1-1287" aria-hidden="true" tabindex="-1"></a><span class="co"># Nucleus sampling parameter for generation</span></span>
+<span id="cb1-1288"><a href="#cb1-1288" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_top_p</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1289"><a href="#cb1-1289" aria-hidden="true" tabindex="-1"></a><span class="co"># Top-k sampling parameter for generation</span></span>
+<span id="cb1-1290"><a href="#cb1-1290" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_top_k</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1291"><a href="#cb1-1291" aria-hidden="true" tabindex="-1"></a><span class="co"># Ratio of input to use as prompt (0.0-1.0)</span></span>
+<span id="cb1-1292"><a href="#cb1-1292" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_prompt_ratio</span><span class="kw">:</span><span class="at"> float | None = 0.5</span></span>
+<span id="cb1-1293"><a href="#cb1-1293" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use sampling (vs greedy decoding)</span></span>
+<span id="cb1-1294"><a href="#cb1-1294" aria-hidden="true" tabindex="-1"></a><span class="fu">generation_do_sample</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
+<span id="cb1-1295"><a href="#cb1-1295" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1296"><a href="#cb1-1296" aria-hidden="true" tabindex="-1"></a><span class="co"># This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This</span></span>
+<span id="cb1-1297"><a href="#cb1-1297" aria-hidden="true" tabindex="-1"></a><span class="co"># can also be a relative path to a model on disk</span></span>
+<span id="cb1-1298"><a href="#cb1-1298" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model</span><span class="kw">:</span><span class="at"> str (required)</span></span>
+<span id="cb1-1299"><a href="#cb1-1299" aria-hidden="true" tabindex="-1"></a><span class="co"># If the base_model repo on hf hub doesn't include configuration .json files, You can</span></span>
+<span id="cb1-1300"><a href="#cb1-1300" aria-hidden="true" tabindex="-1"></a><span class="co"># set that here, or leave this empty to default to base_model</span></span>
+<span id="cb1-1301"><a href="#cb1-1301" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model_config</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1302"><a href="#cb1-1302" aria-hidden="true" tabindex="-1"></a><span class="co"># transformers config class (e.g., 'LlamaConfig', 'MistralConfig'). Defaults to</span></span>
+<span id="cb1-1303"><a href="#cb1-1303" aria-hidden="true" tabindex="-1"></a><span class="co"># AutoConfig.</span></span>
+<span id="cb1-1304"><a href="#cb1-1304" aria-hidden="true" tabindex="-1"></a><span class="fu">cls_model_config</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1305"><a href="#cb1-1305" aria-hidden="true" tabindex="-1"></a><span class="co"># Optional tokenizer configuration path in case you want to use a different tokenizer</span></span>
+<span id="cb1-1306"><a href="#cb1-1306" aria-hidden="true" tabindex="-1"></a><span class="co"># than the one defined in the base model</span></span>
+<span id="cb1-1307"><a href="#cb1-1307" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_config</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1308"><a href="#cb1-1308" aria-hidden="true" tabindex="-1"></a><span class="co"># use_fast option for tokenizer loading from_pretrained, default to True</span></span>
+<span id="cb1-1309"><a href="#cb1-1309" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_use_fast</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1310"><a href="#cb1-1310" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use the legacy tokenizer setting, defaults to True</span></span>
+<span id="cb1-1311"><a href="#cb1-1311" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_legacy</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1312"><a href="#cb1-1312" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use mistral-common tokenizer. If set to True, it will use the mistral-</span></span>
+<span id="cb1-1313"><a href="#cb1-1313" aria-hidden="true" tabindex="-1"></a><span class="co"># common tokenizer.</span></span>
+<span id="cb1-1314"><a href="#cb1-1314" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_use_mistral_common</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1315"><a href="#cb1-1315" aria-hidden="true" tabindex="-1"></a><span class="co"># Corresponding tokenizer for the model AutoTokenizer is a good choice</span></span>
+<span id="cb1-1316"><a href="#cb1-1316" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_type</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1317"><a href="#cb1-1317" aria-hidden="true" tabindex="-1"></a><span class="co"># transformers processor class</span></span>
+<span id="cb1-1318"><a href="#cb1-1318" aria-hidden="true" tabindex="-1"></a><span class="fu">processor_type</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1319"><a href="#cb1-1319" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to save jinja files for tokenizer, transformers default is True</span></span>
+<span id="cb1-1320"><a href="#cb1-1320" aria-hidden="true" tabindex="-1"></a><span class="fu">tokenizer_save_jinja_files</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
+<span id="cb1-1321"><a href="#cb1-1321" aria-hidden="true" tabindex="-1"></a><span class="co"># Trust remote code for untrusted source</span></span>
+<span id="cb1-1322"><a href="#cb1-1322" aria-hidden="true" tabindex="-1"></a><span class="fu">trust_remote_code</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1323"><a href="#cb1-1323" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1324"><a href="#cb1-1324" aria-hidden="true" tabindex="-1"></a><span class="co"># Don't move the model to the device before sharding. Set to `false` to revert to legacy</span></span>
+<span id="cb1-1325"><a href="#cb1-1325" aria-hidden="true" tabindex="-1"></a><span class="co"># behavior.</span></span>
+<span id="cb1-1326"><a href="#cb1-1326" aria-hidden="true" tabindex="-1"></a><span class="fu">experimental_skip_move_to_device</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
+<span id="cb1-1327"><a href="#cb1-1327" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1328"><a href="#cb1-1328" aria-hidden="true" tabindex="-1"></a><span class="co"># Use custom kernels, e.g. MegaBlocks.</span></span>
+<span id="cb1-1329"><a href="#cb1-1329" aria-hidden="true" tabindex="-1"></a><span class="fu">use_kernels</span><span class="kw">:</span><span class="at"> bool | None</span></span>
 <span id="cb1-1330"><a href="#cb1-1330" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1331"><a href="#cb1-1331" aria-hidden="true" tabindex="-1"></a><span class="co"># Don't move the model to the device before sharding. Set to `false` to revert to legacy</span></span>
-<span id="cb1-1332"><a href="#cb1-1332" aria-hidden="true" tabindex="-1"></a><span class="co"># behavior.</span></span>
-<span id="cb1-1333"><a href="#cb1-1333" aria-hidden="true" tabindex="-1"></a><span class="fu">experimental_skip_move_to_device</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
-<span id="cb1-1334"><a href="#cb1-1334" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1335"><a href="#cb1-1335" aria-hidden="true" tabindex="-1"></a><span class="co"># Use custom kernels, e.g. MegaBlocks.</span></span>
-<span id="cb1-1336"><a href="#cb1-1336" aria-hidden="true" tabindex="-1"></a><span class="fu">use_kernels</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1337"><a href="#cb1-1337" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1338"><a href="#cb1-1338" aria-hidden="true" tabindex="-1"></a><span class="co"># Model loading quantization config</span></span>
-<span id="cb1-1339"><a href="#cb1-1339" aria-hidden="true" tabindex="-1"></a><span class="fu">model_quantization_config</span><span class="kw">:</span><span class="at"> Literal['Mxfp4Config', 'FineGrainedFP8Config'] | None</span></span>
-<span id="cb1-1340"><a href="#cb1-1340" aria-hidden="true" tabindex="-1"></a><span class="co"># kwargs for model quantization config</span></span>
-<span id="cb1-1341"><a href="#cb1-1341" aria-hidden="true" tabindex="-1"></a><span class="fu">model_quantization_config_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
-<span id="cb1-1342"><a href="#cb1-1342" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1343"><a href="#cb1-1343" aria-hidden="true" tabindex="-1"></a><span class="co"># Where to save the full-finetuned model to</span></span>
-<span id="cb1-1344"><a href="#cb1-1344" aria-hidden="true" tabindex="-1"></a><span class="fu">output_dir</span><span class="kw">:</span><span class="at"> str = ./model-out</span></span>
-<span id="cb1-1345"><a href="#cb1-1345" aria-hidden="true" tabindex="-1"></a><span class="co"># push checkpoints to hub</span></span>
-<span id="cb1-1346"><a href="#cb1-1346" aria-hidden="true" tabindex="-1"></a><span class="fu">hub_model_id</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1347"><a href="#cb1-1347" aria-hidden="true" tabindex="-1"></a><span class="co"># how to push checkpoints to hub</span></span>
-<span id="cb1-1348"><a href="#cb1-1348" aria-hidden="true" tabindex="-1"></a><span class="fu">hub_strategy</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1349"><a href="#cb1-1349" aria-hidden="true" tabindex="-1"></a><span class="co"># branch/revision to push to on hub (default: main)</span></span>
-<span id="cb1-1350"><a href="#cb1-1350" aria-hidden="true" tabindex="-1"></a><span class="fu">hub_revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1351"><a href="#cb1-1351" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to save the model using safetensors format. Defaults to True.</span></span>
-<span id="cb1-1352"><a href="#cb1-1352" aria-hidden="true" tabindex="-1"></a><span class="fu">save_safetensors</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
-<span id="cb1-1353"><a href="#cb1-1353" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1354"><a href="#cb1-1354" aria-hidden="true" tabindex="-1"></a><span class="co"># This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer</span></span>
-<span id="cb1-1355"><a href="#cb1-1355" aria-hidden="true" tabindex="-1"></a><span class="fu">load_in_8bit</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-1356"><a href="#cb1-1356" aria-hidden="true" tabindex="-1"></a><span class="co"># Use bitsandbytes 4 bit</span></span>
-<span id="cb1-1357"><a href="#cb1-1357" aria-hidden="true" tabindex="-1"></a><span class="fu">load_in_4bit</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-1358"><a href="#cb1-1358" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1359"><a href="#cb1-1359" aria-hidden="true" tabindex="-1"></a><span class="co"># If you want to use 'lora', 'qlora', or 'llama-adapter', or leave blank to train all</span></span>
-<span id="cb1-1360"><a href="#cb1-1360" aria-hidden="true" tabindex="-1"></a><span class="co"># parameters in original model</span></span>
-<span id="cb1-1361"><a href="#cb1-1361" aria-hidden="true" tabindex="-1"></a><span class="fu">adapter</span><span class="kw">:</span><span class="at"> Literal['lora', 'qlora', 'llama-adapter'] | None</span></span>
-<span id="cb1-1362"><a href="#cb1-1362" aria-hidden="true" tabindex="-1"></a><span class="co"># If you already have a lora model trained that you want to load, put that here. This</span></span>
-<span id="cb1-1363"><a href="#cb1-1363" aria-hidden="true" tabindex="-1"></a><span class="co"># means after training, if you want to test the model, you should set this to the value</span></span>
-<span id="cb1-1364"><a href="#cb1-1364" aria-hidden="true" tabindex="-1"></a><span class="co"># of `output_dir`. Note that if you merge an adapter to the base model, a new</span></span>
-<span id="cb1-1365"><a href="#cb1-1365" aria-hidden="true" tabindex="-1"></a><span class="co"># subdirectory `merged` will be created under the `output_dir`.</span></span>
-<span id="cb1-1366"><a href="#cb1-1366" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_model_dir</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1367"><a href="#cb1-1367" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_r</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1368"><a href="#cb1-1368" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_alpha</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1369"><a href="#cb1-1369" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_fan_in_fan_out</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1370"><a href="#cb1-1370" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_modules</span><span class="kw">:</span><span class="at"> str | list[str] | None</span></span>
-<span id="cb1-1371"><a href="#cb1-1371" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_parameters</span><span class="kw">:</span><span class="at"> str | list[str] | None</span></span>
-<span id="cb1-1372"><a href="#cb1-1372" aria-hidden="true" tabindex="-1"></a><span class="co"># If true, will target all linear modules</span></span>
-<span id="cb1-1373"><a href="#cb1-1373" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_linear</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1374"><a href="#cb1-1374" aria-hidden="true" tabindex="-1"></a><span class="co"># If you added new tokens to the tokenizer, you may need to save some LoRA modules</span></span>
-<span id="cb1-1375"><a href="#cb1-1375" aria-hidden="true" tabindex="-1"></a><span class="co"># because they need to know the new tokens. For LLaMA and Mistral, you need to save</span></span>
-<span id="cb1-1376"><a href="#cb1-1376" aria-hidden="true" tabindex="-1"></a><span class="co"># `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts</span></span>
-<span id="cb1-1377"><a href="#cb1-1377" aria-hidden="true" tabindex="-1"></a><span class="co"># tokens to embeddings, and `lm_head` converts embeddings to token probabilities.</span></span>
-<span id="cb1-1378"><a href="#cb1-1378" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_modules_to_save</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-1379"><a href="#cb1-1379" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_dropout</span><span class="kw">:</span><span class="at"> float | None = 0.0</span></span>
-<span id="cb1-1380"><a href="#cb1-1380" aria-hidden="true" tabindex="-1"></a><span class="co"># The layer indices to transform, otherwise, apply to all layers</span></span>
-<span id="cb1-1381"><a href="#cb1-1381" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_layers_to_transform</span><span class="kw">:</span><span class="at"> list[int] | None</span></span>
-<span id="cb1-1382"><a href="#cb1-1382" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_layers_pattern</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
-<span id="cb1-1383"><a href="#cb1-1383" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1384"><a href="#cb1-1384" aria-hidden="true" tabindex="-1"></a><span class="fu">peft</span><span class="kw">:</span><span class="at"> PeftConfig | None</span></span>
-<span id="cb1-1385"><a href="#cb1-1385" aria-hidden="true" tabindex="-1"></a><span class="co">  # For PeftConfig:</span></span>
-<span id="cb1-1386"><a href="#cb1-1386" aria-hidden="true" tabindex="-1"></a><span class="co">  # Configuration options for loftq initialization for LoRA</span></span>
-<span id="cb1-1387"><a href="#cb1-1387" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">loftq_config</span><span class="kw">:</span><span class="at"> LoftQConfig | None</span></span>
-<span id="cb1-1388"><a href="#cb1-1388" aria-hidden="true" tabindex="-1"></a><span class="co">    # For LoftQConfig:</span></span>
-<span id="cb1-1389"><a href="#cb1-1389" aria-hidden="true" tabindex="-1"></a><span class="co">    # typically 4 bits</span></span>
-<span id="cb1-1390"><a href="#cb1-1390" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">loftq_bits</span><span class="kw">:</span><span class="at"> int = 4</span></span>
-<span id="cb1-1391"><a href="#cb1-1391" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1392"><a href="#cb1-1392" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use DoRA.</span></span>
-<span id="cb1-1393"><a href="#cb1-1393" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_use_dora</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1394"><a href="#cb1-1394" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use RSLoRA.</span></span>
-<span id="cb1-1395"><a href="#cb1-1395" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_use_rslora</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1396"><a href="#cb1-1396" aria-hidden="true" tabindex="-1"></a><span class="co"># List of layer indices to replicate.</span></span>
-<span id="cb1-1397"><a href="#cb1-1397" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_layer_replication</span><span class="kw">:</span><span class="at"> list[tuple[int, int]] | None</span></span>
-<span id="cb1-1398"><a href="#cb1-1398" aria-hidden="true" tabindex="-1"></a><span class="co"># How to initialize LoRA weights. Default to True which is MS original implementation.</span></span>
-<span id="cb1-1399"><a href="#cb1-1399" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_init_lora_weights</span><span class="kw">:</span><span class="at"> bool | str | None</span></span>
-<span id="cb1-1400"><a href="#cb1-1400" aria-hidden="true" tabindex="-1"></a><span class="co"># A list of token indices to fine-tune on the `embed_tokens` layer. Otherwise, a dict</span></span>
-<span id="cb1-1401"><a href="#cb1-1401" aria-hidden="true" tabindex="-1"></a><span class="co"># mapping an embedding layer name to its trainable token indices. See</span></span>
-<span id="cb1-1402"><a href="#cb1-1402" aria-hidden="true" tabindex="-1"></a><span class="co"># https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-</span></span>
-<span id="cb1-1403"><a href="#cb1-1403" aria-hidden="true" tabindex="-1"></a><span class="co"># tokens-alongside-lora</span></span>
-<span id="cb1-1404"><a href="#cb1-1404" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_trainable_token_indices</span><span class="kw">:</span><span class="at"> list[int] | dict[str, list[int]] | None</span></span>
-<span id="cb1-1405"><a href="#cb1-1405" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to tie adapter weights for tied model weights. See</span></span>
-<span id="cb1-1406"><a href="#cb1-1406" aria-hidden="true" tabindex="-1"></a><span class="co"># https://github.com/huggingface/peft/issues/2864</span></span>
-<span id="cb1-1407"><a href="#cb1-1407" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_ensure_weight_tying</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1408"><a href="#cb1-1408" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to upcast the LoRA adapter to fp32. This is enabled by default in PEFT.</span></span>
-<span id="cb1-1409"><a href="#cb1-1409" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_autocast_adapter_dtype</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1410"><a href="#cb1-1410" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1411"><a href="#cb1-1411" aria-hidden="true" tabindex="-1"></a><span class="co"># load qlora model in sharded format for FSDP using answer.ai technique.</span></span>
-<span id="cb1-1412"><a href="#cb1-1412" aria-hidden="true" tabindex="-1"></a><span class="fu">qlora_sharded_model_loading</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-1413"><a href="#cb1-1413" aria-hidden="true" tabindex="-1"></a><span class="co"># Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it</span></span>
-<span id="cb1-1414"><a href="#cb1-1414" aria-hidden="true" tabindex="-1"></a><span class="co"># takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge</span></span>
-<span id="cb1-1415"><a href="#cb1-1415" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_on_cpu</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1416"><a href="#cb1-1416" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether you are training a 4-bit GPTQ quantized model</span></span>
-<span id="cb1-1417"><a href="#cb1-1417" aria-hidden="true" tabindex="-1"></a><span class="fu">gptq</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1418"><a href="#cb1-1418" aria-hidden="true" tabindex="-1"></a><span class="co"># optional overrides to the bnb 4bit quantization configuration</span></span>
-<span id="cb1-1419"><a href="#cb1-1419" aria-hidden="true" tabindex="-1"></a><span class="fu">bnb_config_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
-<span id="cb1-1420"><a href="#cb1-1420" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1421"><a href="#cb1-1421" aria-hidden="true" tabindex="-1"></a><span class="co"># loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.</span></span>
-<span id="cb1-1422"><a href="#cb1-1422" aria-hidden="true" tabindex="-1"></a><span class="fu">loraplus_lr_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1423"><a href="#cb1-1423" aria-hidden="true" tabindex="-1"></a><span class="co"># loraplus learning rate for lora embedding layers. Default value is 1e-6.</span></span>
-<span id="cb1-1424"><a href="#cb1-1424" aria-hidden="true" tabindex="-1"></a><span class="fu">loraplus_lr_embedding</span><span class="kw">:</span><span class="at"> float | None = 1e-06</span></span>
-<span id="cb1-1425"><a href="#cb1-1425" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1426"><a href="#cb1-1426" aria-hidden="true" tabindex="-1"></a><span class="fu">merge_lora</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1427"><a href="#cb1-1427" aria-hidden="true" tabindex="-1"></a><span class="co"># Method to use for LoRA merging. 'memory_efficient' (default) processes shards</span></span>
-<span id="cb1-1428"><a href="#cb1-1428" aria-hidden="true" tabindex="-1"></a><span class="co"># individually to reduce memory usage, 'legacy' loads the full model into memory.</span></span>
-<span id="cb1-1429"><a href="#cb1-1429" aria-hidden="true" tabindex="-1"></a><span class="fu">merge_method</span><span class="kw">:</span><span class="at"> Literal['legacy', 'memory_efficient'] | None = memory_efficient</span></span>
-<span id="cb1-1430"><a href="#cb1-1430" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1431"><a href="#cb1-1431" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use ReLoRA. Use with jagged_restart_*steps options.</span></span>
-<span id="cb1-1432"><a href="#cb1-1432" aria-hidden="true" tabindex="-1"></a><span class="fu">relora</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1433"><a href="#cb1-1433" aria-hidden="true" tabindex="-1"></a><span class="co"># threshold for optimizer magnitude when pruning</span></span>
-<span id="cb1-1434"><a href="#cb1-1434" aria-hidden="true" tabindex="-1"></a><span class="fu">relora_prune_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1435"><a href="#cb1-1435" aria-hidden="true" tabindex="-1"></a><span class="co"># True to perform lora weight merges on cpu during restarts, for modest gpu memory</span></span>
-<span id="cb1-1436"><a href="#cb1-1436" aria-hidden="true" tabindex="-1"></a><span class="co"># savings</span></span>
-<span id="cb1-1437"><a href="#cb1-1437" aria-hidden="true" tabindex="-1"></a><span class="fu">relora_cpu_offload</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1331"><a href="#cb1-1331" aria-hidden="true" tabindex="-1"></a><span class="co"># Model loading quantization config</span></span>
+<span id="cb1-1332"><a href="#cb1-1332" aria-hidden="true" tabindex="-1"></a><span class="fu">model_quantization_config</span><span class="kw">:</span><span class="at"> Literal['Mxfp4Config', 'FineGrainedFP8Config'] | None</span></span>
+<span id="cb1-1333"><a href="#cb1-1333" aria-hidden="true" tabindex="-1"></a><span class="co"># kwargs for model quantization config</span></span>
+<span id="cb1-1334"><a href="#cb1-1334" aria-hidden="true" tabindex="-1"></a><span class="fu">model_quantization_config_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
+<span id="cb1-1335"><a href="#cb1-1335" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1336"><a href="#cb1-1336" aria-hidden="true" tabindex="-1"></a><span class="co"># Where to save the full-finetuned model to</span></span>
+<span id="cb1-1337"><a href="#cb1-1337" aria-hidden="true" tabindex="-1"></a><span class="fu">output_dir</span><span class="kw">:</span><span class="at"> str = ./model-out</span></span>
+<span id="cb1-1338"><a href="#cb1-1338" aria-hidden="true" tabindex="-1"></a><span class="co"># push checkpoints to hub</span></span>
+<span id="cb1-1339"><a href="#cb1-1339" aria-hidden="true" tabindex="-1"></a><span class="fu">hub_model_id</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1340"><a href="#cb1-1340" aria-hidden="true" tabindex="-1"></a><span class="co"># how to push checkpoints to hub</span></span>
+<span id="cb1-1341"><a href="#cb1-1341" aria-hidden="true" tabindex="-1"></a><span class="fu">hub_strategy</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1342"><a href="#cb1-1342" aria-hidden="true" tabindex="-1"></a><span class="co"># branch/revision to push to on hub (default: main)</span></span>
+<span id="cb1-1343"><a href="#cb1-1343" aria-hidden="true" tabindex="-1"></a><span class="fu">hub_revision</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1344"><a href="#cb1-1344" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to save the model using safetensors format. Defaults to True.</span></span>
+<span id="cb1-1345"><a href="#cb1-1345" aria-hidden="true" tabindex="-1"></a><span class="fu">save_safetensors</span><span class="kw">:</span><span class="at"> bool | None = True</span></span>
+<span id="cb1-1346"><a href="#cb1-1346" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1347"><a href="#cb1-1347" aria-hidden="true" tabindex="-1"></a><span class="co"># This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer</span></span>
+<span id="cb1-1348"><a href="#cb1-1348" aria-hidden="true" tabindex="-1"></a><span class="fu">load_in_8bit</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-1349"><a href="#cb1-1349" aria-hidden="true" tabindex="-1"></a><span class="co"># Use bitsandbytes 4 bit</span></span>
+<span id="cb1-1350"><a href="#cb1-1350" aria-hidden="true" tabindex="-1"></a><span class="fu">load_in_4bit</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-1351"><a href="#cb1-1351" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1352"><a href="#cb1-1352" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to 'lora', 'qlora', or 'llama-adapter' to use that adapter, or leave blank to train</span></span>
+<span id="cb1-1353"><a href="#cb1-1353" aria-hidden="true" tabindex="-1"></a><span class="co"># all parameters in the original model</span></span>
+<span id="cb1-1354"><a href="#cb1-1354" aria-hidden="true" tabindex="-1"></a><span class="fu">adapter</span><span class="kw">:</span><span class="at"> Literal['lora', 'qlora', 'llama-adapter'] | None</span></span>
+<span id="cb1-1355"><a href="#cb1-1355" aria-hidden="true" tabindex="-1"></a><span class="co"># If you already have a lora model trained that you want to load, put that here. This</span></span>
+<span id="cb1-1356"><a href="#cb1-1356" aria-hidden="true" tabindex="-1"></a><span class="co"># means after training, if you want to test the model, you should set this to the value</span></span>
+<span id="cb1-1357"><a href="#cb1-1357" aria-hidden="true" tabindex="-1"></a><span class="co"># of `output_dir`. Note that if you merge an adapter to the base model, a new</span></span>
+<span id="cb1-1358"><a href="#cb1-1358" aria-hidden="true" tabindex="-1"></a><span class="co"># subdirectory `merged` will be created under the `output_dir`.</span></span>
+<span id="cb1-1359"><a href="#cb1-1359" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_model_dir</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1360"><a href="#cb1-1360" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_r</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1361"><a href="#cb1-1361" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_alpha</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1362"><a href="#cb1-1362" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_fan_in_fan_out</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1363"><a href="#cb1-1363" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_modules</span><span class="kw">:</span><span class="at"> str | list[str] | None</span></span>
+<span id="cb1-1364"><a href="#cb1-1364" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_parameters</span><span class="kw">:</span><span class="at"> str | list[str] | None</span></span>
+<span id="cb1-1365"><a href="#cb1-1365" aria-hidden="true" tabindex="-1"></a><span class="co"># If true, will target all linear modules</span></span>
+<span id="cb1-1366"><a href="#cb1-1366" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_target_linear</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1367"><a href="#cb1-1367" aria-hidden="true" tabindex="-1"></a><span class="co"># If you added new tokens to the tokenizer, you may need to save some LoRA modules</span></span>
+<span id="cb1-1368"><a href="#cb1-1368" aria-hidden="true" tabindex="-1"></a><span class="co"># because they need to know the new tokens. For LLaMA and Mistral, you need to save</span></span>
+<span id="cb1-1369"><a href="#cb1-1369" aria-hidden="true" tabindex="-1"></a><span class="co"># `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts</span></span>
+<span id="cb1-1370"><a href="#cb1-1370" aria-hidden="true" tabindex="-1"></a><span class="co"># tokens to embeddings, and `lm_head` converts embeddings to token probabilities.</span></span>
+<span id="cb1-1371"><a href="#cb1-1371" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_modules_to_save</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-1372"><a href="#cb1-1372" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_dropout</span><span class="kw">:</span><span class="at"> float | None = 0.0</span></span>
+<span id="cb1-1373"><a href="#cb1-1373" aria-hidden="true" tabindex="-1"></a><span class="co"># The layer indices to transform, otherwise, apply to all layers</span></span>
+<span id="cb1-1374"><a href="#cb1-1374" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_layers_to_transform</span><span class="kw">:</span><span class="at"> list[int] | None</span></span>
+<span id="cb1-1375"><a href="#cb1-1375" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_layers_pattern</span><span class="kw">:</span><span class="at"> list[str] | None</span></span>
+<span id="cb1-1376"><a href="#cb1-1376" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1377"><a href="#cb1-1377" aria-hidden="true" tabindex="-1"></a><span class="fu">peft</span><span class="kw">:</span><span class="at"> PeftConfig | None</span></span>
+<span id="cb1-1378"><a href="#cb1-1378" aria-hidden="true" tabindex="-1"></a><span class="co">  # For PeftConfig:</span></span>
+<span id="cb1-1379"><a href="#cb1-1379" aria-hidden="true" tabindex="-1"></a><span class="co">  # Configuration options for loftq initialization for LoRA</span></span>
+<span id="cb1-1380"><a href="#cb1-1380" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">loftq_config</span><span class="kw">:</span><span class="at"> LoftQConfig | None</span></span>
+<span id="cb1-1381"><a href="#cb1-1381" aria-hidden="true" tabindex="-1"></a><span class="co">    # For LoftQConfig:</span></span>
+<span id="cb1-1382"><a href="#cb1-1382" aria-hidden="true" tabindex="-1"></a><span class="co">    # typically 4 bits</span></span>
+<span id="cb1-1383"><a href="#cb1-1383" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">loftq_bits</span><span class="kw">:</span><span class="at"> int = 4</span></span>
+<span id="cb1-1384"><a href="#cb1-1384" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1385"><a href="#cb1-1385" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use DoRA.</span></span>
+<span id="cb1-1386"><a href="#cb1-1386" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_use_dora</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1387"><a href="#cb1-1387" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use RSLoRA.</span></span>
+<span id="cb1-1388"><a href="#cb1-1388" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_use_rslora</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1389"><a href="#cb1-1389" aria-hidden="true" tabindex="-1"></a><span class="co"># List of layer indices to replicate.</span></span>
+<span id="cb1-1390"><a href="#cb1-1390" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_layer_replication</span><span class="kw">:</span><span class="at"> list[tuple[int, int]] | None</span></span>
+<span id="cb1-1391"><a href="#cb1-1391" aria-hidden="true" tabindex="-1"></a><span class="co"># How to initialize LoRA weights. Defaults to True, which is the original MS implementation.</span></span>
+<span id="cb1-1392"><a href="#cb1-1392" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_init_lora_weights</span><span class="kw">:</span><span class="at"> bool | str | None</span></span>
+<span id="cb1-1393"><a href="#cb1-1393" aria-hidden="true" tabindex="-1"></a><span class="co"># A list of token indices to fine-tune on the `embed_tokens` layer. Otherwise, a dict</span></span>
+<span id="cb1-1394"><a href="#cb1-1394" aria-hidden="true" tabindex="-1"></a><span class="co"># mapping an embedding layer name to its trainable token indices.</span></span>
+<span id="cb1-1395"><a href="#cb1-1395" aria-hidden="true" tabindex="-1"></a><span class="co"># See the PEFT guide on efficiently training tokens alongside LoRA:</span></span>
+<span id="cb1-1396"><a href="#cb1-1396" aria-hidden="true" tabindex="-1"></a><span class="co"># https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-tokens-alongside-lora</span></span>
+<span id="cb1-1397"><a href="#cb1-1397" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_trainable_token_indices</span><span class="kw">:</span><span class="at"> list[int] | dict[str, list[int]] | None</span></span>
+<span id="cb1-1398"><a href="#cb1-1398" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to tie adapter weights for tied model weights. See</span></span>
+<span id="cb1-1399"><a href="#cb1-1399" aria-hidden="true" tabindex="-1"></a><span class="co"># https://github.com/huggingface/peft/issues/2864</span></span>
+<span id="cb1-1400"><a href="#cb1-1400" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_ensure_weight_tying</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1401"><a href="#cb1-1401" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to upcast the LoRA adapter to fp32. This is enabled by default in PEFT.</span></span>
+<span id="cb1-1402"><a href="#cb1-1402" aria-hidden="true" tabindex="-1"></a><span class="fu">peft_autocast_adapter_dtype</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1403"><a href="#cb1-1403" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1404"><a href="#cb1-1404" aria-hidden="true" tabindex="-1"></a><span class="co"># load qlora model in sharded format for FSDP using answer.ai technique.</span></span>
+<span id="cb1-1405"><a href="#cb1-1405" aria-hidden="true" tabindex="-1"></a><span class="fu">qlora_sharded_model_loading</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-1406"><a href="#cb1-1406" aria-hidden="true" tabindex="-1"></a><span class="co"># Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it</span></span>
+<span id="cb1-1407"><a href="#cb1-1407" aria-hidden="true" tabindex="-1"></a><span class="co"># takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge</span></span>
+<span id="cb1-1408"><a href="#cb1-1408" aria-hidden="true" tabindex="-1"></a><span class="fu">lora_on_cpu</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1409"><a href="#cb1-1409" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether you are training a 4-bit GPTQ quantized model</span></span>
+<span id="cb1-1410"><a href="#cb1-1410" aria-hidden="true" tabindex="-1"></a><span class="fu">gptq</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1411"><a href="#cb1-1411" aria-hidden="true" tabindex="-1"></a><span class="co"># optional overrides to the bnb 4bit quantization configuration</span></span>
+<span id="cb1-1412"><a href="#cb1-1412" aria-hidden="true" tabindex="-1"></a><span class="fu">bnb_config_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
+<span id="cb1-1413"><a href="#cb1-1413" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1414"><a href="#cb1-1414" aria-hidden="true" tabindex="-1"></a><span class="co"># loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.</span></span>
+<span id="cb1-1415"><a href="#cb1-1415" aria-hidden="true" tabindex="-1"></a><span class="fu">loraplus_lr_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1416"><a href="#cb1-1416" aria-hidden="true" tabindex="-1"></a><span class="co"># loraplus learning rate for lora embedding layers. Default value is 1e-6.</span></span>
+<span id="cb1-1417"><a href="#cb1-1417" aria-hidden="true" tabindex="-1"></a><span class="fu">loraplus_lr_embedding</span><span class="kw">:</span><span class="at"> float | None = 1e-06</span></span>
+<span id="cb1-1418"><a href="#cb1-1418" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1419"><a href="#cb1-1419" aria-hidden="true" tabindex="-1"></a><span class="fu">merge_lora</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1420"><a href="#cb1-1420" aria-hidden="true" tabindex="-1"></a><span class="co"># Method to use for LoRA merging. 'memory_efficient' (default) processes shards</span></span>
+<span id="cb1-1421"><a href="#cb1-1421" aria-hidden="true" tabindex="-1"></a><span class="co"># individually to reduce memory usage, 'legacy' loads the full model into memory.</span></span>
+<span id="cb1-1422"><a href="#cb1-1422" aria-hidden="true" tabindex="-1"></a><span class="fu">merge_method</span><span class="kw">:</span><span class="at"> Literal['legacy', 'memory_efficient'] | None = memory_efficient</span></span>
+<span id="cb1-1423"><a href="#cb1-1423" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1424"><a href="#cb1-1424" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use ReLoRA. Use with jagged_restart_*steps options.</span></span>
+<span id="cb1-1425"><a href="#cb1-1425" aria-hidden="true" tabindex="-1"></a><span class="fu">relora</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1426"><a href="#cb1-1426" aria-hidden="true" tabindex="-1"></a><span class="co"># threshold for optimizer magnitude when pruning</span></span>
+<span id="cb1-1427"><a href="#cb1-1427" aria-hidden="true" tabindex="-1"></a><span class="fu">relora_prune_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1428"><a href="#cb1-1428" aria-hidden="true" tabindex="-1"></a><span class="co"># True to perform lora weight merges on cpu during restarts, for modest gpu memory</span></span>
+<span id="cb1-1429"><a href="#cb1-1429" aria-hidden="true" tabindex="-1"></a><span class="co"># savings</span></span>
+<span id="cb1-1430"><a href="#cb1-1430" aria-hidden="true" tabindex="-1"></a><span class="fu">relora_cpu_offload</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1431"><a href="#cb1-1431" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1432"><a href="#cb1-1432" aria-hidden="true" tabindex="-1"></a><span class="co"># how often to reset for jagged restarts</span></span>
+<span id="cb1-1433"><a href="#cb1-1433" aria-hidden="true" tabindex="-1"></a><span class="fu">jagged_restart_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1434"><a href="#cb1-1434" aria-hidden="true" tabindex="-1"></a><span class="co"># how many warmup steps to take after reset for jagged restarts</span></span>
+<span id="cb1-1435"><a href="#cb1-1435" aria-hidden="true" tabindex="-1"></a><span class="fu">jagged_restart_warmup_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1436"><a href="#cb1-1436" aria-hidden="true" tabindex="-1"></a><span class="co"># how many anneal steps to take before reset for jagged restarts</span></span>
+<span id="cb1-1437"><a href="#cb1-1437" aria-hidden="true" tabindex="-1"></a><span class="fu">jagged_restart_anneal_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
 <span id="cb1-1438"><a href="#cb1-1438" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1439"><a href="#cb1-1439" aria-hidden="true" tabindex="-1"></a><span class="co"># how often to reset for jagged restarts</span></span>
-<span id="cb1-1440"><a href="#cb1-1440" aria-hidden="true" tabindex="-1"></a><span class="fu">jagged_restart_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1441"><a href="#cb1-1441" aria-hidden="true" tabindex="-1"></a><span class="co"># how many warmup steps to take after reset for jagged restarts</span></span>
-<span id="cb1-1442"><a href="#cb1-1442" aria-hidden="true" tabindex="-1"></a><span class="fu">jagged_restart_warmup_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1443"><a href="#cb1-1443" aria-hidden="true" tabindex="-1"></a><span class="co"># how many anneal steps to take before reset for jagged restarts</span></span>
-<span id="cb1-1444"><a href="#cb1-1444" aria-hidden="true" tabindex="-1"></a><span class="fu">jagged_restart_anneal_steps</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1445"><a href="#cb1-1445" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1446"><a href="#cb1-1446" aria-hidden="true" tabindex="-1"></a><span class="co"># If greater than 1, backpropagation will be skipped and the gradients will be</span></span>
-<span id="cb1-1447"><a href="#cb1-1447" aria-hidden="true" tabindex="-1"></a><span class="co"># accumulated for the given number of steps.</span></span>
-<span id="cb1-1448"><a href="#cb1-1448" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_accumulation_steps</span><span class="kw">:</span><span class="at"> int | None = 1</span></span>
-<span id="cb1-1449"><a href="#cb1-1449" aria-hidden="true" tabindex="-1"></a><span class="co"># The number of samples to include in each batch. This is the number of samples sent to</span></span>
-<span id="cb1-1450"><a href="#cb1-1450" aria-hidden="true" tabindex="-1"></a><span class="co"># each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps</span></span>
-<span id="cb1-1451"><a href="#cb1-1451" aria-hidden="true" tabindex="-1"></a><span class="fu">micro_batch_size</span><span class="kw">:</span><span class="at"> int | None = 1</span></span>
-<span id="cb1-1452"><a href="#cb1-1452" aria-hidden="true" tabindex="-1"></a><span class="co"># Total batch size, we do not recommended setting this manually</span></span>
-<span id="cb1-1453"><a href="#cb1-1453" aria-hidden="true" tabindex="-1"></a><span class="fu">batch_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1454"><a href="#cb1-1454" aria-hidden="true" tabindex="-1"></a><span class="co"># per gpu micro batch size for evals, defaults to value of micro_batch_size</span></span>
-<span id="cb1-1455"><a href="#cb1-1455" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_batch_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1456"><a href="#cb1-1456" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1457"><a href="#cb1-1457" aria-hidden="true" tabindex="-1"></a><span class="co"># whether to find batch size that fits in memory. Passed to underlying transformers</span></span>
-<span id="cb1-1458"><a href="#cb1-1458" aria-hidden="true" tabindex="-1"></a><span class="co"># Trainer</span></span>
-<span id="cb1-1459"><a href="#cb1-1459" aria-hidden="true" tabindex="-1"></a><span class="fu">auto_find_batch_size</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1439"><a href="#cb1-1439" aria-hidden="true" tabindex="-1"></a><span class="co"># If greater than 1, backpropagation will be skipped and the gradients will be</span></span>
+<span id="cb1-1440"><a href="#cb1-1440" aria-hidden="true" tabindex="-1"></a><span class="co"># accumulated for the given number of steps.</span></span>
+<span id="cb1-1441"><a href="#cb1-1441" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_accumulation_steps</span><span class="kw">:</span><span class="at"> int | None = 1</span></span>
+<span id="cb1-1442"><a href="#cb1-1442" aria-hidden="true" tabindex="-1"></a><span class="co"># The number of samples to include in each batch. This is the number of samples sent to</span></span>
+<span id="cb1-1443"><a href="#cb1-1443" aria-hidden="true" tabindex="-1"></a><span class="co"># each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps</span></span>
+<span id="cb1-1444"><a href="#cb1-1444" aria-hidden="true" tabindex="-1"></a><span class="fu">micro_batch_size</span><span class="kw">:</span><span class="at"> int | None = 1</span></span>
+<span id="cb1-1445"><a href="#cb1-1445" aria-hidden="true" tabindex="-1"></a><span class="co"># Total batch size; we do not recommend setting this manually</span></span>
+<span id="cb1-1446"><a href="#cb1-1446" aria-hidden="true" tabindex="-1"></a><span class="fu">batch_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1447"><a href="#cb1-1447" aria-hidden="true" tabindex="-1"></a><span class="co"># per gpu micro batch size for evals, defaults to value of micro_batch_size</span></span>
+<span id="cb1-1448"><a href="#cb1-1448" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_batch_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1449"><a href="#cb1-1449" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1450"><a href="#cb1-1450" aria-hidden="true" tabindex="-1"></a><span class="co"># whether to find batch size that fits in memory. Passed to underlying transformers</span></span>
+<span id="cb1-1451"><a href="#cb1-1451" aria-hidden="true" tabindex="-1"></a><span class="co"># Trainer</span></span>
+<span id="cb1-1452"><a href="#cb1-1452" aria-hidden="true" tabindex="-1"></a><span class="fu">auto_find_batch_size</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1453"><a href="#cb1-1453" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1454"><a href="#cb1-1454" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to mask the human's prompt out of the training labels or include it in them</span></span>
+<span id="cb1-1455"><a href="#cb1-1455" aria-hidden="true" tabindex="-1"></a><span class="fu">train_on_inputs</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-1456"><a href="#cb1-1456" aria-hidden="true" tabindex="-1"></a><span class="co"># Group similarly sized data to minimize padding. May be slower to start, as it must</span></span>
+<span id="cb1-1457"><a href="#cb1-1457" aria-hidden="true" tabindex="-1"></a><span class="co"># download and sort the entire dataset. Note that training loss may have an oscillating</span></span>
+<span id="cb1-1458"><a href="#cb1-1458" aria-hidden="true" tabindex="-1"></a><span class="co"># pattern with this enabled.</span></span>
+<span id="cb1-1459"><a href="#cb1-1459" aria-hidden="true" tabindex="-1"></a><span class="fu">group_by_length</span><span class="kw">:</span><span class="at"> bool | None</span></span>
 <span id="cb1-1460"><a href="#cb1-1460" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1461"><a href="#cb1-1461" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to mask out or include the human's prompt from the training labels</span></span>
-<span id="cb1-1462"><a href="#cb1-1462" aria-hidden="true" tabindex="-1"></a><span class="fu">train_on_inputs</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-1463"><a href="#cb1-1463" aria-hidden="true" tabindex="-1"></a><span class="co"># Group similarly sized data to minimize padding. May be slower to start, as it must</span></span>
-<span id="cb1-1464"><a href="#cb1-1464" aria-hidden="true" tabindex="-1"></a><span class="co"># download and sort the entire dataset. Note that training loss may have an oscillating</span></span>
-<span id="cb1-1465"><a href="#cb1-1465" aria-hidden="true" tabindex="-1"></a><span class="co"># pattern with this enabled.</span></span>
-<span id="cb1-1466"><a href="#cb1-1466" aria-hidden="true" tabindex="-1"></a><span class="fu">group_by_length</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1467"><a href="#cb1-1467" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1468"><a href="#cb1-1468" aria-hidden="true" tabindex="-1"></a><span class="fu">learning_rate</span><span class="kw">:</span><span class="at"> str | float (required)</span></span>
-<span id="cb1-1469"><a href="#cb1-1469" aria-hidden="true" tabindex="-1"></a><span class="fu">embedding_lr</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1470"><a href="#cb1-1470" aria-hidden="true" tabindex="-1"></a><span class="fu">embedding_lr_scale</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1471"><a href="#cb1-1471" aria-hidden="true" tabindex="-1"></a><span class="co"># Specify weight decay</span></span>
-<span id="cb1-1472"><a href="#cb1-1472" aria-hidden="true" tabindex="-1"></a><span class="fu">weight_decay</span><span class="kw">:</span><span class="at"> float | None = 0.0</span></span>
-<span id="cb1-1473"><a href="#cb1-1473" aria-hidden="true" tabindex="-1"></a><span class="co"># Specify optimizer</span></span>
-<span id="cb1-1474"><a href="#cb1-1474" aria-hidden="true" tabindex="-1"></a><span class="fu">optimizer</span><span class="kw">:</span><span class="at"> OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED</span></span>
-<span id="cb1-1475"><a href="#cb1-1475" aria-hidden="true" tabindex="-1"></a><span class="co"># Dictionary of arguments to pass to the optimizer</span></span>
-<span id="cb1-1476"><a href="#cb1-1476" aria-hidden="true" tabindex="-1"></a><span class="fu">optim_args</span><span class="kw">:</span><span class="at"> str | dict[str, Any] | None</span></span>
-<span id="cb1-1477"><a href="#cb1-1477" aria-hidden="true" tabindex="-1"></a><span class="co"># The target modules to optimize, i.e. the module names that you would like to train,</span></span>
-<span id="cb1-1478"><a href="#cb1-1478" aria-hidden="true" tabindex="-1"></a><span class="co"># right now this is used only for GaLore algorithm</span></span>
-<span id="cb1-1479"><a href="#cb1-1479" aria-hidden="true" tabindex="-1"></a><span class="fu">optim_target_modules</span><span class="kw">:</span><span class="at"> list[str] | Literal['all_linear'] | None</span></span>
-<span id="cb1-1480"><a href="#cb1-1480" aria-hidden="true" tabindex="-1"></a><span class="co"># Path to torch distx for optim 'adamw_anyprecision'</span></span>
-<span id="cb1-1481"><a href="#cb1-1481" aria-hidden="true" tabindex="-1"></a><span class="fu">torchdistx_path</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1482"><a href="#cb1-1482" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_scheduler</span><span class="kw">:</span><span class="at"> SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE</span></span>
-<span id="cb1-1483"><a href="#cb1-1483" aria-hidden="true" tabindex="-1"></a><span class="co"># Specify a scheduler and kwargs to use with the optimizer</span></span>
-<span id="cb1-1484"><a href="#cb1-1484" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_scheduler_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
-<span id="cb1-1485"><a href="#cb1-1485" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_quadratic_warmup</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1486"><a href="#cb1-1486" aria-hidden="true" tabindex="-1"></a><span class="co"># decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of</span></span>
-<span id="cb1-1487"><a href="#cb1-1487" aria-hidden="true" tabindex="-1"></a><span class="co"># peak lr</span></span>
-<span id="cb1-1488"><a href="#cb1-1488" aria-hidden="true" tabindex="-1"></a><span class="fu">cosine_min_lr_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1489"><a href="#cb1-1489" aria-hidden="true" tabindex="-1"></a><span class="co"># freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means</span></span>
-<span id="cb1-1490"><a href="#cb1-1490" aria-hidden="true" tabindex="-1"></a><span class="co"># start cosine_min_lr at 80% of training step</span></span>
-<span id="cb1-1491"><a href="#cb1-1491" aria-hidden="true" tabindex="-1"></a><span class="fu">cosine_constant_lr_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1492"><a href="#cb1-1492" aria-hidden="true" tabindex="-1"></a><span class="co"># Learning rate div factor</span></span>
-<span id="cb1-1493"><a href="#cb1-1493" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_div_factor</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1494"><a href="#cb1-1494" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1495"><a href="#cb1-1495" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_groups</span><span class="kw">:</span><span class="at"> list[LrGroup] | None</span></span>
-<span id="cb1-1496"><a href="#cb1-1496" aria-hidden="true" tabindex="-1"></a><span class="co">  # For LrGroup:</span></span>
-<span id="cb1-1497"><a href="#cb1-1497" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">name</span><span class="kw">:</span><span class="at"> str (required)</span></span>
-<span id="cb1-1498"><a href="#cb1-1498" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">modules</span><span class="kw">:</span><span class="at"> list[str] (required)</span></span>
-<span id="cb1-1499"><a href="#cb1-1499" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">lr</span><span class="kw">:</span><span class="at"> float (required)</span></span>
-<span id="cb1-1500"><a href="#cb1-1500" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1501"><a href="#cb1-1501" aria-hidden="true" tabindex="-1"></a><span class="co"># adamw hyperparams</span></span>
-<span id="cb1-1502"><a href="#cb1-1502" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_epsilon</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1503"><a href="#cb1-1503" aria-hidden="true" tabindex="-1"></a><span class="co"># only used for CAME Optimizer</span></span>
-<span id="cb1-1504"><a href="#cb1-1504" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_epsilon2</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1505"><a href="#cb1-1505" aria-hidden="true" tabindex="-1"></a><span class="co"># adamw hyperparams</span></span>
-<span id="cb1-1506"><a href="#cb1-1506" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_beta1</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1507"><a href="#cb1-1507" aria-hidden="true" tabindex="-1"></a><span class="co"># adamw hyperparams</span></span>
-<span id="cb1-1508"><a href="#cb1-1508" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_beta2</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1509"><a href="#cb1-1509" aria-hidden="true" tabindex="-1"></a><span class="co"># only used for CAME Optimizer</span></span>
-<span id="cb1-1510"><a href="#cb1-1510" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_beta3</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1511"><a href="#cb1-1511" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1512"><a href="#cb1-1512" aria-hidden="true" tabindex="-1"></a><span class="co"># Dion Optimizer learning rate</span></span>
-<span id="cb1-1513"><a href="#cb1-1513" aria-hidden="true" tabindex="-1"></a><span class="fu">dion_lr</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1514"><a href="#cb1-1514" aria-hidden="true" tabindex="-1"></a><span class="co"># Dion Optimizer momentum</span></span>
-<span id="cb1-1515"><a href="#cb1-1515" aria-hidden="true" tabindex="-1"></a><span class="fu">dion_momentum</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1516"><a href="#cb1-1516" aria-hidden="true" tabindex="-1"></a><span class="co"># Dion Optimizer: r/d fraction for low-rank approximation. Used to compute the low-rank</span></span>
-<span id="cb1-1517"><a href="#cb1-1517" aria-hidden="true" tabindex="-1"></a><span class="co"># dimension.</span></span>
-<span id="cb1-1518"><a href="#cb1-1518" aria-hidden="true" tabindex="-1"></a><span class="fu">dion_rank_fraction</span><span class="kw">:</span><span class="at"> float | None = 1.0</span></span>
-<span id="cb1-1519"><a href="#cb1-1519" aria-hidden="true" tabindex="-1"></a><span class="co"># Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may</span></span>
-<span id="cb1-1520"><a href="#cb1-1520" aria-hidden="true" tabindex="-1"></a><span class="co"># be useful to ensure even sharding.</span></span>
-<span id="cb1-1521"><a href="#cb1-1521" aria-hidden="true" tabindex="-1"></a><span class="fu">dion_rank_multiple_of</span><span class="kw">:</span><span class="at"> int | None = 1</span></span>
-<span id="cb1-1522"><a href="#cb1-1522" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1523"><a href="#cb1-1523" aria-hidden="true" tabindex="-1"></a><span class="co"># Gradient clipping max norm</span></span>
-<span id="cb1-1524"><a href="#cb1-1524" aria-hidden="true" tabindex="-1"></a><span class="fu">max_grad_norm</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1525"><a href="#cb1-1525" aria-hidden="true" tabindex="-1"></a><span class="fu">num_epochs</span><span class="kw">:</span><span class="at"> float = 1.0</span></span>
-<span id="cb1-1526"><a href="#cb1-1526" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1527"><a href="#cb1-1527" aria-hidden="true" tabindex="-1"></a><span class="fu">use_wandb</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1528"><a href="#cb1-1528" aria-hidden="true" tabindex="-1"></a><span class="co"># Set the name of your wandb run</span></span>
-<span id="cb1-1529"><a href="#cb1-1529" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1530"><a href="#cb1-1530" aria-hidden="true" tabindex="-1"></a><span class="co"># Set the ID of your wandb run</span></span>
-<span id="cb1-1531"><a href="#cb1-1531" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_run_id</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1532"><a href="#cb1-1532" aria-hidden="true" tabindex="-1"></a><span class="co"># "offline" to save run metadata locally and not sync to the server, "disabled" to turn</span></span>
-<span id="cb1-1533"><a href="#cb1-1533" aria-hidden="true" tabindex="-1"></a><span class="co"># off wandb</span></span>
-<span id="cb1-1534"><a href="#cb1-1534" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_mode</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1535"><a href="#cb1-1535" aria-hidden="true" tabindex="-1"></a><span class="co"># Your wandb project name</span></span>
-<span id="cb1-1536"><a href="#cb1-1536" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_project</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1537"><a href="#cb1-1537" aria-hidden="true" tabindex="-1"></a><span class="co"># A wandb Team name if using a Team</span></span>
-<span id="cb1-1538"><a href="#cb1-1538" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_entity</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1539"><a href="#cb1-1539" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_watch</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1540"><a href="#cb1-1540" aria-hidden="true" tabindex="-1"></a><span class="co"># "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only</span></span>
-<span id="cb1-1541"><a href="#cb1-1541" aria-hidden="true" tabindex="-1"></a><span class="co"># at the end of training</span></span>
-<span id="cb1-1542"><a href="#cb1-1542" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_log_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1543"><a href="#cb1-1543" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1544"><a href="#cb1-1544" aria-hidden="true" tabindex="-1"></a><span class="fu">use_mlflow</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1545"><a href="#cb1-1545" aria-hidden="true" tabindex="-1"></a><span class="co"># URI to mlflow</span></span>
-<span id="cb1-1546"><a href="#cb1-1546" aria-hidden="true" tabindex="-1"></a><span class="fu">mlflow_tracking_uri</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1547"><a href="#cb1-1547" aria-hidden="true" tabindex="-1"></a><span class="co"># Your experiment name</span></span>
-<span id="cb1-1548"><a href="#cb1-1548" aria-hidden="true" tabindex="-1"></a><span class="fu">mlflow_experiment_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1549"><a href="#cb1-1549" aria-hidden="true" tabindex="-1"></a><span class="co"># Your run name</span></span>
-<span id="cb1-1550"><a href="#cb1-1550" aria-hidden="true" tabindex="-1"></a><span class="fu">mlflow_run_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1551"><a href="#cb1-1551" aria-hidden="true" tabindex="-1"></a><span class="co"># set to true to copy each saved checkpoint on each save to mlflow artifact registry</span></span>
-<span id="cb1-1552"><a href="#cb1-1552" aria-hidden="true" tabindex="-1"></a><span class="fu">hf_mlflow_log_artifacts</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1553"><a href="#cb1-1553" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1554"><a href="#cb1-1554" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable or disable Comet integration.</span></span>
-<span id="cb1-1555"><a href="#cb1-1555" aria-hidden="true" tabindex="-1"></a><span class="fu">use_comet</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1556"><a href="#cb1-1556" aria-hidden="true" tabindex="-1"></a><span class="co"># API key for Comet. Recommended to set via `comet login`.</span></span>
-<span id="cb1-1557"><a href="#cb1-1557" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_api_key</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1558"><a href="#cb1-1558" aria-hidden="true" tabindex="-1"></a><span class="co"># Workspace name in Comet. Defaults to the user's default workspace.</span></span>
-<span id="cb1-1559"><a href="#cb1-1559" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_workspace</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1560"><a href="#cb1-1560" aria-hidden="true" tabindex="-1"></a><span class="co"># Project name in Comet. Defaults to Uncategorized.</span></span>
-<span id="cb1-1561"><a href="#cb1-1561" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_project_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1562"><a href="#cb1-1562" aria-hidden="true" tabindex="-1"></a><span class="co"># Identifier for the experiment. Used to append data to an existing experiment or</span></span>
-<span id="cb1-1563"><a href="#cb1-1563" aria-hidden="true" tabindex="-1"></a><span class="co"># control the key of new experiments. Default to a random key.</span></span>
-<span id="cb1-1564"><a href="#cb1-1564" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_experiment_key</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1565"><a href="#cb1-1565" aria-hidden="true" tabindex="-1"></a><span class="co"># Create a new experiment ("create") or log to an existing one ("get"). Default</span></span>
-<span id="cb1-1566"><a href="#cb1-1566" aria-hidden="true" tabindex="-1"></a><span class="co"># ("get_or_create") auto-selects based on configuration.</span></span>
-<span id="cb1-1567"><a href="#cb1-1567" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_mode</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1568"><a href="#cb1-1568" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to True to log data to Comet server, or False for offline storage. Default is</span></span>
-<span id="cb1-1569"><a href="#cb1-1569" aria-hidden="true" tabindex="-1"></a><span class="co"># True.</span></span>
-<span id="cb1-1570"><a href="#cb1-1570" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_online</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1571"><a href="#cb1-1571" aria-hidden="true" tabindex="-1"></a><span class="co"># Dictionary for additional configuration settings, see the doc for more details.</span></span>
-<span id="cb1-1572"><a href="#cb1-1572" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_experiment_config</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
-<span id="cb1-1573"><a href="#cb1-1573" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1574"><a href="#cb1-1574" aria-hidden="true" tabindex="-1"></a><span class="fu">use_trackio</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1575"><a href="#cb1-1575" aria-hidden="true" tabindex="-1"></a><span class="co"># Your trackio project name</span></span>
-<span id="cb1-1576"><a href="#cb1-1576" aria-hidden="true" tabindex="-1"></a><span class="fu">trackio_project_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1577"><a href="#cb1-1577" aria-hidden="true" tabindex="-1"></a><span class="co"># Set the name of your trackio run</span></span>
-<span id="cb1-1578"><a href="#cb1-1578" aria-hidden="true" tabindex="-1"></a><span class="fu">trackio_run_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1579"><a href="#cb1-1579" aria-hidden="true" tabindex="-1"></a><span class="co"># Hugging Face Space ID to sync dashboard to (optional, runs locally if not provided)</span></span>
-<span id="cb1-1580"><a href="#cb1-1580" aria-hidden="true" tabindex="-1"></a><span class="fu">trackio_space_id</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1461"><a href="#cb1-1461" aria-hidden="true" tabindex="-1"></a><span class="fu">learning_rate</span><span class="kw">:</span><span class="at"> str | float (required)</span></span>
+<span id="cb1-1462"><a href="#cb1-1462" aria-hidden="true" tabindex="-1"></a><span class="fu">embedding_lr</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1463"><a href="#cb1-1463" aria-hidden="true" tabindex="-1"></a><span class="fu">embedding_lr_scale</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1464"><a href="#cb1-1464" aria-hidden="true" tabindex="-1"></a><span class="co"># Specify weight decay</span></span>
+<span id="cb1-1465"><a href="#cb1-1465" aria-hidden="true" tabindex="-1"></a><span class="fu">weight_decay</span><span class="kw">:</span><span class="at"> float | None = 0.0</span></span>
+<span id="cb1-1466"><a href="#cb1-1466" aria-hidden="true" tabindex="-1"></a><span class="co"># Specify optimizer</span></span>
+<span id="cb1-1467"><a href="#cb1-1467" aria-hidden="true" tabindex="-1"></a><span class="fu">optimizer</span><span class="kw">:</span><span class="at"> OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED</span></span>
+<span id="cb1-1468"><a href="#cb1-1468" aria-hidden="true" tabindex="-1"></a><span class="co"># Dictionary of arguments to pass to the optimizer</span></span>
+<span id="cb1-1469"><a href="#cb1-1469" aria-hidden="true" tabindex="-1"></a><span class="fu">optim_args</span><span class="kw">:</span><span class="at"> str | dict[str, Any] | None</span></span>
+<span id="cb1-1470"><a href="#cb1-1470" aria-hidden="true" tabindex="-1"></a><span class="co"># The target modules to optimize, i.e. the module names that you would like to train,</span></span>
+<span id="cb1-1471"><a href="#cb1-1471" aria-hidden="true" tabindex="-1"></a><span class="co"># right now this is used only for GaLore algorithm</span></span>
+<span id="cb1-1472"><a href="#cb1-1472" aria-hidden="true" tabindex="-1"></a><span class="fu">optim_target_modules</span><span class="kw">:</span><span class="at"> list[str] | Literal['all_linear'] | None</span></span>
+<span id="cb1-1473"><a href="#cb1-1473" aria-hidden="true" tabindex="-1"></a><span class="co"># Path to torchdistx for optimizer 'adamw_anyprecision'</span></span>
+<span id="cb1-1474"><a href="#cb1-1474" aria-hidden="true" tabindex="-1"></a><span class="fu">torchdistx_path</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1475"><a href="#cb1-1475" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_scheduler</span><span class="kw">:</span><span class="at"> SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE</span></span>
+<span id="cb1-1476"><a href="#cb1-1476" aria-hidden="true" tabindex="-1"></a><span class="co"># Specify a scheduler and kwargs to use with the optimizer</span></span>
+<span id="cb1-1477"><a href="#cb1-1477" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_scheduler_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
+<span id="cb1-1478"><a href="#cb1-1478" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_quadratic_warmup</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1479"><a href="#cb1-1479" aria-hidden="true" tabindex="-1"></a><span class="co"># decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of</span></span>
+<span id="cb1-1480"><a href="#cb1-1480" aria-hidden="true" tabindex="-1"></a><span class="co"># peak lr</span></span>
+<span id="cb1-1481"><a href="#cb1-1481" aria-hidden="true" tabindex="-1"></a><span class="fu">cosine_min_lr_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1482"><a href="#cb1-1482" aria-hidden="true" tabindex="-1"></a><span class="co"># freeze lr at some percentage of the training steps, e.g. cosine_constant_lr_ratio=0.8 means</span></span>
+<span id="cb1-1483"><a href="#cb1-1483" aria-hidden="true" tabindex="-1"></a><span class="co"># start cosine_min_lr at 80% of the training steps</span></span>
+<span id="cb1-1484"><a href="#cb1-1484" aria-hidden="true" tabindex="-1"></a><span class="fu">cosine_constant_lr_ratio</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1485"><a href="#cb1-1485" aria-hidden="true" tabindex="-1"></a><span class="co"># Learning rate div factor</span></span>
+<span id="cb1-1486"><a href="#cb1-1486" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_div_factor</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1487"><a href="#cb1-1487" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1488"><a href="#cb1-1488" aria-hidden="true" tabindex="-1"></a><span class="fu">lr_groups</span><span class="kw">:</span><span class="at"> list[LrGroup] | None</span></span>
+<span id="cb1-1489"><a href="#cb1-1489" aria-hidden="true" tabindex="-1"></a><span class="co">  # For LrGroup:</span></span>
+<span id="cb1-1490"><a href="#cb1-1490" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">name</span><span class="kw">:</span><span class="at"> str (required)</span></span>
+<span id="cb1-1491"><a href="#cb1-1491" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">modules</span><span class="kw">:</span><span class="at"> list[str] (required)</span></span>
+<span id="cb1-1492"><a href="#cb1-1492" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="fu">lr</span><span class="kw">:</span><span class="at"> float (required)</span></span>
+<span id="cb1-1493"><a href="#cb1-1493" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1494"><a href="#cb1-1494" aria-hidden="true" tabindex="-1"></a><span class="co"># adamw hyperparams</span></span>
+<span id="cb1-1495"><a href="#cb1-1495" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_epsilon</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1496"><a href="#cb1-1496" aria-hidden="true" tabindex="-1"></a><span class="co"># only used for CAME Optimizer</span></span>
+<span id="cb1-1497"><a href="#cb1-1497" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_epsilon2</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1498"><a href="#cb1-1498" aria-hidden="true" tabindex="-1"></a><span class="co"># adamw hyperparams</span></span>
+<span id="cb1-1499"><a href="#cb1-1499" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_beta1</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1500"><a href="#cb1-1500" aria-hidden="true" tabindex="-1"></a><span class="co"># adamw hyperparams</span></span>
+<span id="cb1-1501"><a href="#cb1-1501" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_beta2</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1502"><a href="#cb1-1502" aria-hidden="true" tabindex="-1"></a><span class="co"># only used for CAME Optimizer</span></span>
+<span id="cb1-1503"><a href="#cb1-1503" aria-hidden="true" tabindex="-1"></a><span class="fu">adam_beta3</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1504"><a href="#cb1-1504" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1505"><a href="#cb1-1505" aria-hidden="true" tabindex="-1"></a><span class="co"># Dion Optimizer learning rate</span></span>
+<span id="cb1-1506"><a href="#cb1-1506" aria-hidden="true" tabindex="-1"></a><span class="fu">dion_lr</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1507"><a href="#cb1-1507" aria-hidden="true" tabindex="-1"></a><span class="co"># Dion Optimizer momentum</span></span>
+<span id="cb1-1508"><a href="#cb1-1508" aria-hidden="true" tabindex="-1"></a><span class="fu">dion_momentum</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1509"><a href="#cb1-1509" aria-hidden="true" tabindex="-1"></a><span class="co"># Dion Optimizer: r/d fraction for low-rank approximation. Used to compute the low-rank</span></span>
+<span id="cb1-1510"><a href="#cb1-1510" aria-hidden="true" tabindex="-1"></a><span class="co"># dimension.</span></span>
+<span id="cb1-1511"><a href="#cb1-1511" aria-hidden="true" tabindex="-1"></a><span class="fu">dion_rank_fraction</span><span class="kw">:</span><span class="at"> float | None = 1.0</span></span>
+<span id="cb1-1512"><a href="#cb1-1512" aria-hidden="true" tabindex="-1"></a><span class="co"># Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may</span></span>
+<span id="cb1-1513"><a href="#cb1-1513" aria-hidden="true" tabindex="-1"></a><span class="co"># be useful to ensure even sharding.</span></span>
+<span id="cb1-1514"><a href="#cb1-1514" aria-hidden="true" tabindex="-1"></a><span class="fu">dion_rank_multiple_of</span><span class="kw">:</span><span class="at"> int | None = 1</span></span>
+<span id="cb1-1515"><a href="#cb1-1515" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1516"><a href="#cb1-1516" aria-hidden="true" tabindex="-1"></a><span class="co"># Gradient clipping max norm</span></span>
+<span id="cb1-1517"><a href="#cb1-1517" aria-hidden="true" tabindex="-1"></a><span class="fu">max_grad_norm</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1518"><a href="#cb1-1518" aria-hidden="true" tabindex="-1"></a><span class="fu">num_epochs</span><span class="kw">:</span><span class="at"> float = 1.0</span></span>
+<span id="cb1-1519"><a href="#cb1-1519" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1520"><a href="#cb1-1520" aria-hidden="true" tabindex="-1"></a><span class="fu">use_wandb</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1521"><a href="#cb1-1521" aria-hidden="true" tabindex="-1"></a><span class="co"># Set the name of your wandb run</span></span>
+<span id="cb1-1522"><a href="#cb1-1522" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1523"><a href="#cb1-1523" aria-hidden="true" tabindex="-1"></a><span class="co"># Set the ID of your wandb run</span></span>
+<span id="cb1-1524"><a href="#cb1-1524" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_run_id</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1525"><a href="#cb1-1525" aria-hidden="true" tabindex="-1"></a><span class="co"># "offline" to save run metadata locally and not sync to the server, "disabled" to turn</span></span>
+<span id="cb1-1526"><a href="#cb1-1526" aria-hidden="true" tabindex="-1"></a><span class="co"># off wandb</span></span>
+<span id="cb1-1527"><a href="#cb1-1527" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_mode</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1528"><a href="#cb1-1528" aria-hidden="true" tabindex="-1"></a><span class="co"># Your wandb project name</span></span>
+<span id="cb1-1529"><a href="#cb1-1529" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_project</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1530"><a href="#cb1-1530" aria-hidden="true" tabindex="-1"></a><span class="co"># A wandb Team name if using a Team</span></span>
+<span id="cb1-1531"><a href="#cb1-1531" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_entity</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1532"><a href="#cb1-1532" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_watch</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1533"><a href="#cb1-1533" aria-hidden="true" tabindex="-1"></a><span class="co"># "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only</span></span>
+<span id="cb1-1534"><a href="#cb1-1534" aria-hidden="true" tabindex="-1"></a><span class="co"># at the end of training</span></span>
+<span id="cb1-1535"><a href="#cb1-1535" aria-hidden="true" tabindex="-1"></a><span class="fu">wandb_log_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1536"><a href="#cb1-1536" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1537"><a href="#cb1-1537" aria-hidden="true" tabindex="-1"></a><span class="fu">use_mlflow</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1538"><a href="#cb1-1538" aria-hidden="true" tabindex="-1"></a><span class="co"># URI to mlflow</span></span>
+<span id="cb1-1539"><a href="#cb1-1539" aria-hidden="true" tabindex="-1"></a><span class="fu">mlflow_tracking_uri</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1540"><a href="#cb1-1540" aria-hidden="true" tabindex="-1"></a><span class="co"># Your experiment name</span></span>
+<span id="cb1-1541"><a href="#cb1-1541" aria-hidden="true" tabindex="-1"></a><span class="fu">mlflow_experiment_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1542"><a href="#cb1-1542" aria-hidden="true" tabindex="-1"></a><span class="co"># Your run name</span></span>
+<span id="cb1-1543"><a href="#cb1-1543" aria-hidden="true" tabindex="-1"></a><span class="fu">mlflow_run_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1544"><a href="#cb1-1544" aria-hidden="true" tabindex="-1"></a><span class="co"># set to true to copy each saved checkpoint on each save to mlflow artifact registry</span></span>
+<span id="cb1-1545"><a href="#cb1-1545" aria-hidden="true" tabindex="-1"></a><span class="fu">hf_mlflow_log_artifacts</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1546"><a href="#cb1-1546" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1547"><a href="#cb1-1547" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable or disable Comet integration.</span></span>
+<span id="cb1-1548"><a href="#cb1-1548" aria-hidden="true" tabindex="-1"></a><span class="fu">use_comet</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1549"><a href="#cb1-1549" aria-hidden="true" tabindex="-1"></a><span class="co"># API key for Comet. Recommended to set via `comet login`.</span></span>
+<span id="cb1-1550"><a href="#cb1-1550" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_api_key</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1551"><a href="#cb1-1551" aria-hidden="true" tabindex="-1"></a><span class="co"># Workspace name in Comet. Defaults to the user's default workspace.</span></span>
+<span id="cb1-1552"><a href="#cb1-1552" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_workspace</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1553"><a href="#cb1-1553" aria-hidden="true" tabindex="-1"></a><span class="co"># Project name in Comet. Defaults to Uncategorized.</span></span>
+<span id="cb1-1554"><a href="#cb1-1554" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_project_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1555"><a href="#cb1-1555" aria-hidden="true" tabindex="-1"></a><span class="co"># Identifier for the experiment. Used to append data to an existing experiment or</span></span>
+<span id="cb1-1556"><a href="#cb1-1556" aria-hidden="true" tabindex="-1"></a><span class="co"># control the key of new experiments. Defaults to a random key.</span></span>
+<span id="cb1-1557"><a href="#cb1-1557" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_experiment_key</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1558"><a href="#cb1-1558" aria-hidden="true" tabindex="-1"></a><span class="co"># Create a new experiment ("create") or log to an existing one ("get"). Default</span></span>
+<span id="cb1-1559"><a href="#cb1-1559" aria-hidden="true" tabindex="-1"></a><span class="co"># ("get_or_create") auto-selects based on configuration.</span></span>
+<span id="cb1-1560"><a href="#cb1-1560" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_mode</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1561"><a href="#cb1-1561" aria-hidden="true" tabindex="-1"></a><span class="co"># Set to True to log data to Comet server, or False for offline storage. Default is</span></span>
+<span id="cb1-1562"><a href="#cb1-1562" aria-hidden="true" tabindex="-1"></a><span class="co"># True.</span></span>
+<span id="cb1-1563"><a href="#cb1-1563" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_online</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1564"><a href="#cb1-1564" aria-hidden="true" tabindex="-1"></a><span class="co"># Dictionary for additional configuration settings, see the doc for more details.</span></span>
+<span id="cb1-1565"><a href="#cb1-1565" aria-hidden="true" tabindex="-1"></a><span class="fu">comet_experiment_config</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
+<span id="cb1-1566"><a href="#cb1-1566" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1567"><a href="#cb1-1567" aria-hidden="true" tabindex="-1"></a><span class="fu">use_trackio</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1568"><a href="#cb1-1568" aria-hidden="true" tabindex="-1"></a><span class="co"># Your trackio project name</span></span>
+<span id="cb1-1569"><a href="#cb1-1569" aria-hidden="true" tabindex="-1"></a><span class="fu">trackio_project_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1570"><a href="#cb1-1570" aria-hidden="true" tabindex="-1"></a><span class="co"># Set the name of your trackio run</span></span>
+<span id="cb1-1571"><a href="#cb1-1571" aria-hidden="true" tabindex="-1"></a><span class="fu">trackio_run_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1572"><a href="#cb1-1572" aria-hidden="true" tabindex="-1"></a><span class="co"># Hugging Face Space ID to sync dashboard to (optional, runs locally if not provided)</span></span>
+<span id="cb1-1573"><a href="#cb1-1573" aria-hidden="true" tabindex="-1"></a><span class="fu">trackio_space_id</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1574"><a href="#cb1-1574" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1575"><a href="#cb1-1575" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable OpenTelemetry metrics collection and Prometheus export</span></span>
+<span id="cb1-1576"><a href="#cb1-1576" aria-hidden="true" tabindex="-1"></a><span class="fu">use_otel_metrics</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
+<span id="cb1-1577"><a href="#cb1-1577" aria-hidden="true" tabindex="-1"></a><span class="co"># Host to bind the OpenTelemetry metrics server to</span></span>
+<span id="cb1-1578"><a href="#cb1-1578" aria-hidden="true" tabindex="-1"></a><span class="fu">otel_metrics_host</span><span class="kw">:</span><span class="at"> str | None = localhost</span></span>
+<span id="cb1-1579"><a href="#cb1-1579" aria-hidden="true" tabindex="-1"></a><span class="co"># Port for the Prometheus metrics HTTP server</span></span>
+<span id="cb1-1580"><a href="#cb1-1580" aria-hidden="true" tabindex="-1"></a><span class="fu">otel_metrics_port</span><span class="kw">:</span><span class="at"> int | None = 8000</span></span>
 <span id="cb1-1581"><a href="#cb1-1581" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1582"><a href="#cb1-1582" aria-hidden="true" tabindex="-1"></a><span class="co"># Enable OpenTelemetry metrics collection and Prometheus export</span></span>
-<span id="cb1-1583"><a href="#cb1-1583" aria-hidden="true" tabindex="-1"></a><span class="fu">use_otel_metrics</span><span class="kw">:</span><span class="at"> bool | None = False</span></span>
-<span id="cb1-1584"><a href="#cb1-1584" aria-hidden="true" tabindex="-1"></a><span class="co"># Host to bind the OpenTelemetry metrics server to</span></span>
-<span id="cb1-1585"><a href="#cb1-1585" aria-hidden="true" tabindex="-1"></a><span class="fu">otel_metrics_host</span><span class="kw">:</span><span class="at"> str | None = localhost</span></span>
-<span id="cb1-1586"><a href="#cb1-1586" aria-hidden="true" tabindex="-1"></a><span class="co"># Port for the Prometheus metrics HTTP server</span></span>
-<span id="cb1-1587"><a href="#cb1-1587" aria-hidden="true" tabindex="-1"></a><span class="fu">otel_metrics_port</span><span class="kw">:</span><span class="at"> int | None = 8000</span></span>
+<span id="cb1-1582"><a href="#cb1-1582" aria-hidden="true" tabindex="-1"></a><span class="co"># the number of active layers in LISA</span></span>
+<span id="cb1-1583"><a href="#cb1-1583" aria-hidden="true" tabindex="-1"></a><span class="fu">lisa_n_layers</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1584"><a href="#cb1-1584" aria-hidden="true" tabindex="-1"></a><span class="co"># how often to switch layers in LISA</span></span>
+<span id="cb1-1585"><a href="#cb1-1585" aria-hidden="true" tabindex="-1"></a><span class="fu">lisa_step_interval</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1586"><a href="#cb1-1586" aria-hidden="true" tabindex="-1"></a><span class="co"># path under the model to access the layers</span></span>
+<span id="cb1-1587"><a href="#cb1-1587" aria-hidden="true" tabindex="-1"></a><span class="fu">lisa_layers_attribute</span><span class="kw">:</span><span class="at"> str | None = model.layers</span></span>
 <span id="cb1-1588"><a href="#cb1-1588" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1589"><a href="#cb1-1589" aria-hidden="true" tabindex="-1"></a><span class="co"># the number of activate layers in LISA</span></span>
-<span id="cb1-1590"><a href="#cb1-1590" aria-hidden="true" tabindex="-1"></a><span class="fu">lisa_n_layers</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1591"><a href="#cb1-1591" aria-hidden="true" tabindex="-1"></a><span class="co"># how often to switch layers in LISA</span></span>
-<span id="cb1-1592"><a href="#cb1-1592" aria-hidden="true" tabindex="-1"></a><span class="fu">lisa_step_interval</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1593"><a href="#cb1-1593" aria-hidden="true" tabindex="-1"></a><span class="co"># path under the model to access the layers</span></span>
-<span id="cb1-1594"><a href="#cb1-1594" aria-hidden="true" tabindex="-1"></a><span class="fu">lisa_layers_attribute</span><span class="kw">:</span><span class="at"> str | None = model.layers</span></span>
+<span id="cb1-1589"><a href="#cb1-1589" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_title</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1590"><a href="#cb1-1590" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_share</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1591"><a href="#cb1-1591" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_server_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1592"><a href="#cb1-1592" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_server_port</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1593"><a href="#cb1-1593" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_max_new_tokens</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1594"><a href="#cb1-1594" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_temperature</span><span class="kw">:</span><span class="at"> float | None</span></span>
 <span id="cb1-1595"><a href="#cb1-1595" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1596"><a href="#cb1-1596" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_title</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1597"><a href="#cb1-1597" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_share</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1598"><a href="#cb1-1598" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_server_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1599"><a href="#cb1-1599" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_server_port</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1600"><a href="#cb1-1600" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_max_new_tokens</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1601"><a href="#cb1-1601" aria-hidden="true" tabindex="-1"></a><span class="fu">gradio_temperature</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1602"><a href="#cb1-1602" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1603"><a href="#cb1-1603" aria-hidden="true" tabindex="-1"></a><span class="fu">use_ray</span><span class="kw">:</span><span class="at"> bool = False</span></span>
-<span id="cb1-1604"><a href="#cb1-1604" aria-hidden="true" tabindex="-1"></a><span class="fu">ray_run_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1605"><a href="#cb1-1605" aria-hidden="true" tabindex="-1"></a><span class="fu">ray_num_workers</span><span class="kw">:</span><span class="at"> int = 1</span></span>
-<span id="cb1-1606"><a href="#cb1-1606" aria-hidden="true" tabindex="-1"></a><span class="fu">resources_per_worker</span><span class="kw">:</span><span class="at"> dict</span></span>
-<span id="cb1-1607"><a href="#cb1-1607" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1608"><a href="#cb1-1608" aria-hidden="true" tabindex="-1"></a><span class="co"># The size of the image to resize to. It can be an integer (resized into padded-square</span></span>
-<span id="cb1-1609"><a href="#cb1-1609" aria-hidden="true" tabindex="-1"></a><span class="co"># image) or a tuple (width, height).If not provided, we will attempt to load from</span></span>
-<span id="cb1-1610"><a href="#cb1-1610" aria-hidden="true" tabindex="-1"></a><span class="co"># preprocessor.size, otherwise, images won't be resized.</span></span>
-<span id="cb1-1611"><a href="#cb1-1611" aria-hidden="true" tabindex="-1"></a><span class="fu">image_size</span><span class="kw">:</span><span class="at"> int | tuple[int, int] | None</span></span>
-<span id="cb1-1612"><a href="#cb1-1612" aria-hidden="true" tabindex="-1"></a><span class="co"># The resampling algorithm to use for image resizing. Default is bilinear. Please refer</span></span>
-<span id="cb1-1613"><a href="#cb1-1613" aria-hidden="true" tabindex="-1"></a><span class="co"># to PIL.Image.Resampling for more details.</span></span>
-<span id="cb1-1614"><a href="#cb1-1614" aria-hidden="true" tabindex="-1"></a><span class="fu">image_resize_algorithm</span><span class="kw">:</span><span class="at"> Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None</span></span>
-<span id="cb1-1615"><a href="#cb1-1615" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1616"><a href="#cb1-1616" aria-hidden="true" tabindex="-1"></a><span class="co"># optional overrides to the base model configuration</span></span>
-<span id="cb1-1617"><a href="#cb1-1617" aria-hidden="true" tabindex="-1"></a><span class="fu">overrides_of_model_config</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
-<span id="cb1-1618"><a href="#cb1-1618" aria-hidden="true" tabindex="-1"></a><span class="co"># optional overrides the base model loading from_pretrained</span></span>
-<span id="cb1-1619"><a href="#cb1-1619" aria-hidden="true" tabindex="-1"></a><span class="fu">overrides_of_model_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
-<span id="cb1-1620"><a href="#cb1-1620" aria-hidden="true" tabindex="-1"></a><span class="co"># If you want to specify the type of model to load, AutoModelForCausalLM is a good</span></span>
-<span id="cb1-1621"><a href="#cb1-1621" aria-hidden="true" tabindex="-1"></a><span class="co"># choice too</span></span>
-<span id="cb1-1622"><a href="#cb1-1622" aria-hidden="true" tabindex="-1"></a><span class="fu">type_of_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1623"><a href="#cb1-1623" aria-hidden="true" tabindex="-1"></a><span class="co"># You can specify to choose a specific model revision from huggingface hub</span></span>
-<span id="cb1-1624"><a href="#cb1-1624" aria-hidden="true" tabindex="-1"></a><span class="fu">revision_of_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1625"><a href="#cb1-1625" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-1626"><a href="#cb1-1626" aria-hidden="true" tabindex="-1"></a><span class="fu">max_packed_sequence_len</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1627"><a href="#cb1-1627" aria-hidden="true" tabindex="-1"></a><span class="fu">rope_scaling</span><span class="kw">:</span><span class="at"> Any | None</span></span>
-<span id="cb1-1628"><a href="#cb1-1628" aria-hidden="true" tabindex="-1"></a><span class="fu">noisy_embedding_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1629"><a href="#cb1-1629" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_beta</span><span class="kw">:</span><span class="at"> float | None</span></span>
-<span id="cb1-1630"><a href="#cb1-1630" aria-hidden="true" tabindex="-1"></a><span class="fu">evaluation_strategy</span><span class="kw">:</span><span class="at"> str | None</span></span>
-<span id="cb1-1631"><a href="#cb1-1631" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_table_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1632"><a href="#cb1-1632" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_max_new_tokens</span><span class="kw">:</span><span class="at"> int | None</span></span>
-<span id="cb1-1633"><a href="#cb1-1633" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_use_logits_to_keep</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1634"><a href="#cb1-1634" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_generate_during_eval</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1635"><a href="#cb1-1635" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_norm_loss</span><span class="kw">:</span><span class="at"> bool | None</span></span>
-<span id="cb1-1636"><a href="#cb1-1636" aria-hidden="true" tabindex="-1"></a><span class="fu">rpo_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<span id="cb1-1596"><a href="#cb1-1596" aria-hidden="true" tabindex="-1"></a><span class="fu">use_ray</span><span class="kw">:</span><span class="at"> bool = False</span></span>
+<span id="cb1-1597"><a href="#cb1-1597" aria-hidden="true" tabindex="-1"></a><span class="fu">ray_run_name</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1598"><a href="#cb1-1598" aria-hidden="true" tabindex="-1"></a><span class="fu">ray_num_workers</span><span class="kw">:</span><span class="at"> int = 1</span></span>
+<span id="cb1-1599"><a href="#cb1-1599" aria-hidden="true" tabindex="-1"></a><span class="fu">resources_per_worker</span><span class="kw">:</span><span class="at"> dict</span></span>
+<span id="cb1-1600"><a href="#cb1-1600" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1601"><a href="#cb1-1601" aria-hidden="true" tabindex="-1"></a><span class="co"># The size of the image to resize to. It can be an integer (resized into padded-square</span></span>
+<span id="cb1-1602"><a href="#cb1-1602" aria-hidden="true" tabindex="-1"></a><span class="co"># image) or a tuple (width, height). If not provided, we will attempt to load from</span></span>
+<span id="cb1-1603"><a href="#cb1-1603" aria-hidden="true" tabindex="-1"></a><span class="co"># preprocessor.size, otherwise, images won't be resized.</span></span>
+<span id="cb1-1604"><a href="#cb1-1604" aria-hidden="true" tabindex="-1"></a><span class="fu">image_size</span><span class="kw">:</span><span class="at"> int | tuple[int, int] | None</span></span>
+<span id="cb1-1605"><a href="#cb1-1605" aria-hidden="true" tabindex="-1"></a><span class="co"># The resampling algorithm to use for image resizing. Default is bilinear. Please refer</span></span>
+<span id="cb1-1606"><a href="#cb1-1606" aria-hidden="true" tabindex="-1"></a><span class="co"># to PIL.Image.Resampling for more details.</span></span>
+<span id="cb1-1607"><a href="#cb1-1607" aria-hidden="true" tabindex="-1"></a><span class="fu">image_resize_algorithm</span><span class="kw">:</span><span class="at"> Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None</span></span>
+<span id="cb1-1608"><a href="#cb1-1608" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1609"><a href="#cb1-1609" aria-hidden="true" tabindex="-1"></a><span class="co"># optional overrides to the base model configuration</span></span>
+<span id="cb1-1610"><a href="#cb1-1610" aria-hidden="true" tabindex="-1"></a><span class="fu">overrides_of_model_config</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
+<span id="cb1-1611"><a href="#cb1-1611" aria-hidden="true" tabindex="-1"></a><span class="co"># optional overrides to the base model loading from_pretrained</span></span>
+<span id="cb1-1612"><a href="#cb1-1612" aria-hidden="true" tabindex="-1"></a><span class="fu">overrides_of_model_kwargs</span><span class="kw">:</span><span class="at"> dict[str, Any] | None</span></span>
+<span id="cb1-1613"><a href="#cb1-1613" aria-hidden="true" tabindex="-1"></a><span class="co"># If you want to specify the type of model to load, AutoModelForCausalLM is a good</span></span>
+<span id="cb1-1614"><a href="#cb1-1614" aria-hidden="true" tabindex="-1"></a><span class="co"># choice too</span></span>
+<span id="cb1-1615"><a href="#cb1-1615" aria-hidden="true" tabindex="-1"></a><span class="fu">type_of_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1616"><a href="#cb1-1616" aria-hidden="true" tabindex="-1"></a><span class="co"># You can specify to choose a specific model revision from huggingface hub</span></span>
+<span id="cb1-1617"><a href="#cb1-1617" aria-hidden="true" tabindex="-1"></a><span class="fu">revision_of_model</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1618"><a href="#cb1-1618" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-1619"><a href="#cb1-1619" aria-hidden="true" tabindex="-1"></a><span class="fu">max_packed_sequence_len</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1620"><a href="#cb1-1620" aria-hidden="true" tabindex="-1"></a><span class="fu">rope_scaling</span><span class="kw">:</span><span class="at"> Any | None</span></span>
+<span id="cb1-1621"><a href="#cb1-1621" aria-hidden="true" tabindex="-1"></a><span class="fu">noisy_embedding_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1622"><a href="#cb1-1622" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_beta</span><span class="kw">:</span><span class="at"> float | None</span></span>
+<span id="cb1-1623"><a href="#cb1-1623" aria-hidden="true" tabindex="-1"></a><span class="fu">evaluation_strategy</span><span class="kw">:</span><span class="at"> str | None</span></span>
+<span id="cb1-1624"><a href="#cb1-1624" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_table_size</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1625"><a href="#cb1-1625" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_max_new_tokens</span><span class="kw">:</span><span class="at"> int | None</span></span>
+<span id="cb1-1626"><a href="#cb1-1626" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_use_logits_to_keep</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1627"><a href="#cb1-1627" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_generate_during_eval</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1628"><a href="#cb1-1628" aria-hidden="true" tabindex="-1"></a><span class="fu">dpo_norm_loss</span><span class="kw">:</span><span class="at"> bool | None</span></span>
+<span id="cb1-1629"><a href="#cb1-1629" aria-hidden="true" tabindex="-1"></a><span class="fu">rpo_alpha</span><span class="kw">:</span><span class="at"> float | None</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 
 
 
diff --git a/docs/custom_integrations.html b/docs/custom_integrations.html
index 231b0a13f..ca3efc1b9 100644
--- a/docs/custom_integrations.html
+++ b/docs/custom_integrations.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/dataset-formats/conversation.html b/docs/dataset-formats/conversation.html
index c36f5f978..0ec4f3547 100644
--- a/docs/dataset-formats/conversation.html
+++ b/docs/dataset-formats/conversation.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/dataset-formats/index.html b/docs/dataset-formats/index.html
index 6d6778ced..6c8f86e2d 100644
--- a/docs/dataset-formats/index.html
+++ b/docs/dataset-formats/index.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/dataset-formats/inst_tune.html b/docs/dataset-formats/inst_tune.html
index ac2b8680b..1f6ef125c 100644
--- a/docs/dataset-formats/inst_tune.html
+++ b/docs/dataset-formats/inst_tune.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/dataset-formats/pretraining.html b/docs/dataset-formats/pretraining.html
index bd56394a5..4ee44dee9 100644
--- a/docs/dataset-formats/pretraining.html
+++ b/docs/dataset-formats/pretraining.html
@@ -662,12 +662,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/dataset-formats/stepwise_supervised.html b/docs/dataset-formats/stepwise_supervised.html
index 6ed98af11..e1bec5050 100644
--- a/docs/dataset-formats/stepwise_supervised.html
+++ b/docs/dataset-formats/stepwise_supervised.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/dataset-formats/template_free.html b/docs/dataset-formats/template_free.html
index 007288060..bc3630886 100644
--- a/docs/dataset-formats/template_free.html
+++ b/docs/dataset-formats/template_free.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/dataset-formats/tokenized.html b/docs/dataset-formats/tokenized.html
index 6869457a6..759c2879d 100644
--- a/docs/dataset-formats/tokenized.html
+++ b/docs/dataset-formats/tokenized.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/dataset_loading.html b/docs/dataset_loading.html
index 7cf8cd263..3a385237b 100644
--- a/docs/dataset_loading.html
+++ b/docs/dataset_loading.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/dataset_preprocessing.html b/docs/dataset_preprocessing.html
index 8e42249c4..68b327f98 100644
--- a/docs/dataset_preprocessing.html
+++ b/docs/dataset_preprocessing.html
@@ -662,12 +662,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/debugging.html b/docs/debugging.html
index 49a259eff..35530faf5 100644
--- a/docs/debugging.html
+++ b/docs/debugging.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -919,8 +913,9 @@ If you prefer to watch a video, rather than read, you can skip to the <a href="#
 <section id="setup" class="level3">
 <h3 class="anchored" data-anchor-id="setup">Setup</h3>
 <p>Make sure you have an <a href="https://setuptools.pypa.io/en/latest/userguide/development_mode.html">editable install</a> of Axolotl, which ensures that changes you make to the code are reflected at runtime. Run the following commands from the root of this project:</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb3"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install packaging</span>
-<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="at">-e</span> <span class="st">'.[flash-attn,deepspeed]'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb3"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">UV_TORCH_BACKEND</span><span class="op">=</span>cu128  <span class="co"># or cu130</span></span>
+<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> sync <span class="at">--extra</span> flash-attn <span class="at">--extra</span> deepspeed <span class="at">--group</span> dev <span class="at">--group</span> test</span>
+<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> .venv/bin/activate</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <section id="remote-hosts" class="level4">
 <h4 class="anchored" data-anchor-id="remote-hosts">Remote Hosts</h4>
 <p>If you developing on a remote host, you can easily use VSCode to debug remotely. To do so, you will need to follow this <a href="https://code.visualstudio.com/docs/remote/ssh">remote - SSH guide</a>. You can also see the video below on <a href="#video---attaching-to-docker-on-remote-host">Docker and Remote SSH debugging</a>.</p>
@@ -1034,14 +1029,14 @@ You may not want to delete these folders. For example, if you are debugging mode
 If you already have axolotl cloned on your host, make sure you have the latest changes and change into the root of the project.</p>
 </blockquote>
 <p>Next, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:<a href="#fn2" class="footnote-ref" id="fnref2" role="doc-noteref"><sup>2</sup></a></p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb7"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="ex">docker</span> run <span class="at">--privileged</span> <span class="at">--gpus</span> <span class="st">'"all"'</span> <span class="at">--shm-size</span> 10g <span class="at">--rm</span> <span class="at">-it</span> <span class="at">--name</span> axolotl <span class="at">--ipc</span><span class="op">=</span>host <span class="at">--ulimit</span> memlock=-1 <span class="at">--ulimit</span> stack=67108864 <span class="at">--mount</span> type=bind,src=<span class="st">"</span><span class="va">${PWD}</span><span class="st">"</span>,target=/workspace/axolotl <span class="at">-v</span> <span class="va">${HOME}</span>/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb7"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="ex">docker</span> run <span class="at">--privileged</span> <span class="at">--gpus</span> <span class="st">'"all"'</span> <span class="at">--shm-size</span> 10g <span class="at">--rm</span> <span class="at">-it</span> <span class="at">--name</span> axolotl <span class="at">--ipc</span><span class="op">=</span>host <span class="at">--ulimit</span> memlock=-1 <span class="at">--ulimit</span> stack=67108864 <span class="at">--mount</span> type=bind,src=<span class="st">"</span><span class="va">${PWD}</span><span class="st">"</span>,target=/workspace/axolotl <span class="at">-v</span> <span class="va">${HOME}</span>/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl-uv:main-latest</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <blockquote class="blockquote">
 <p>[!Tip]
 To understand which containers are available, see the <a href="../README.md#docker">Docker section of the README</a> and the <a href="https://hub.docker.com/r/axolotlai/axolotl/tags">DockerHub repo</a>. For details of how the Docker containers are built, see axolotl’s <a href="../.github/workflows/main.yml">Docker CI builds</a>.</p>
 </blockquote>
-<p>You will now be in the container. Next, perform an editable install of Axolotl:</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb8"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install packaging</span>
-<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="at">-e</span> <span class="st">'.[flash-attn,deepspeed]'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<p>You will now be in the container. Next, install Axolotl with dev dependencies:</p>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb8"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> sync <span class="at">--extra</span> flash-attn <span class="at">--extra</span> deepspeed <span class="at">--group</span> dev <span class="at">--group</span> test</span>
+<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> .venv/bin/activate</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 </section>
 <section id="attach-to-container" class="level3">
 <h3 class="anchored" data-anchor-id="attach-to-container">Attach To Container</h3>
diff --git a/docs/docker.html b/docs/docker.html
index 62001ed89..3c5300f06 100644
--- a/docs/docker.html
+++ b/docs/docker.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -833,7 +827,8 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 </header>
 
 
-<p>This section describes the different Docker images that are released by AxolotlAI at <a href="https://hub.docker.com/u/axolotlai">Docker Hub</a>.</p>
+<p>This section describes the different Docker images that are released by AxolotlAI at
+<a href="https://hub.docker.com/u/axolotlai">Docker Hub</a>.</p>
 <div class="callout callout-style-default callout-important callout-titled">
 <div class="callout-header d-flex align-content-center">
 <div class="callout-icon-container">
@@ -844,24 +839,61 @@ Important
 </div>
 </div>
 <div class="callout-body-container callout-body">
-<p>For Blackwell GPUs, please use the tags with PyTorch 2.7.1 and CUDA 12.8.</p>
+<p>For Blackwell GPUs, please use the tags with PyTorch 2.9.1 and CUDA 12.8.</p>
+</div>
+</div>
+<div class="callout callout-style-default callout-tip callout-titled">
+<div class="callout-header d-flex align-content-center">
+<div class="callout-icon-container">
+<i class="callout-icon"></i>
+</div>
+<div class="callout-title-container flex-fill">
+Tip
+</div>
+</div>
+<div class="callout-body-container callout-body">
+<p>Each image below is available in a <strong>uv variant</strong> that uses <a href="https://docs.astral.sh/uv/">uv</a> with
+a relocatable venv (<code>/workspace/axolotl-venv</code>) instead of Miniconda + pip. Append <code>-uv</code> to the image name
+(e.g.&nbsp;<code>axolotlai/axolotl-base-uv</code>). Tags follow the same format. We recommend the uv images for new deployments.</p>
 </div>
 </div>
 <section id="base" class="level2">
 <h2 class="anchored" data-anchor-id="base">Base</h2>
-<p>The base image is the most minimal image that can install Axolotl. It is based on the <code>nvidia/cuda</code> image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.</p>
+<p>The base image is the most minimal image that can install Axolotl. It is based on the <code>nvidia/cuda</code> image.
+It includes python, torch, git, git-lfs, awscli, pydantic, and more.</p>
 <section id="image" class="level4">
 <h4 class="anchored" data-anchor-id="image">Image</h4>
-<pre><code>axolotlai/axolotl-base</code></pre>
-<p>Link: <a href="https://hub.docker.com/r/axolotlai/axolotl-base">Docker Hub</a></p>
+<table class="caption-top table">
+<thead>
+<tr class="header">
+<th>Variant</th>
+<th>Image</th>
+<th>Docker Hub</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td>pip</td>
+<td><code>axolotlai/axolotl-base</code></td>
+<td><a href="https://hub.docker.com/r/axolotlai/axolotl-base">Link</a></td>
+</tr>
+<tr class="even">
+<td>uv</td>
+<td><code>axolotlai/axolotl-base-uv</code></td>
+<td><a href="https://hub.docker.com/r/axolotlai/axolotl-base-uv">Link</a></td>
+</tr>
+</tbody>
+</table>
 </section>
 <section id="tags-format" class="level4">
 <h4 class="anchored" data-anchor-id="tags-format">Tags format</h4>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="ex">main-base-py{python_version}-cu{cuda_version}-{pytorch_version}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="ex">main-base-py{python_version}-cu{cuda_version}-{pytorch_version}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <p>Tags examples:</p>
 <ul>
-<li><code>main-base-py3.11-cu128-2.8.0</code></li>
 <li><code>main-base-py3.11-cu128-2.9.1</code></li>
+<li><code>main-base-py3.12-cu128-2.10.0</code></li>
+<li><code>main-base-py3.12-cu130-2.9.1</code></li>
+<li><code>main-base-py3.12-cu130-2.10.0</code></li>
 </ul>
 </section>
 </section>
@@ -870,22 +902,41 @@ Important
 <p>The main image is the image that is used to run Axolotl. It is based on the <code>axolotlai/axolotl-base</code> image and includes the Axolotl codebase, dependencies, and more.</p>
 <section id="image-1" class="level4">
 <h4 class="anchored" data-anchor-id="image-1">Image</h4>
-<pre><code>axolotlai/axolotl</code></pre>
-<p>Link: <a href="https://hub.docker.com/r/axolotlai/axolotl">Docker Hub</a></p>
+<table class="caption-top table">
+<thead>
+<tr class="header">
+<th>Variant</th>
+<th>Image</th>
+<th>Docker Hub</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td>pip</td>
+<td><code>axolotlai/axolotl</code></td>
+<td><a href="https://hub.docker.com/r/axolotlai/axolotl">Link</a></td>
+</tr>
+<tr class="even">
+<td>uv</td>
+<td><code>axolotlai/axolotl-uv</code></td>
+<td><a href="https://hub.docker.com/r/axolotlai/axolotl-uv">Link</a></td>
+</tr>
+</tbody>
+</table>
 </section>
 <section id="sec-main-tags" class="level4">
 <h4 class="anchored" data-anchor-id="sec-main-tags">Tags format</h4>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb4"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="co"># on push to main</span></span>
-<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="ex">main-py{python_version}-cu{cuda_version}-{pytorch_version}</span></span>
-<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a><span class="co"># latest main (currently torch 2.6.0, python 3.11, cuda 12.4)</span></span>
-<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a><span class="ex">main-latest</span></span>
-<span id="cb4-6"><a href="#cb4-6" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb4-7"><a href="#cb4-7" aria-hidden="true" tabindex="-1"></a><span class="co"># nightly build</span></span>
-<span id="cb4-8"><a href="#cb4-8" aria-hidden="true" tabindex="-1"></a><span class="ex">{branch}-{date_in_YYYYMMDD}-py{python_version}-cu{cuda_version}-{pytorch_version}</span></span>
-<span id="cb4-9"><a href="#cb4-9" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb4-10"><a href="#cb4-10" aria-hidden="true" tabindex="-1"></a><span class="co"># tagged release</span></span>
-<span id="cb4-11"><a href="#cb4-11" aria-hidden="true" tabindex="-1"></a><span class="ex">{version}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="co"># on push to main</span></span>
+<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="ex">main-py{python_version}-cu{cuda_version}-{pytorch_version}</span></span>
+<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a><span class="co"># latest main (currently torch 2.9.1, python 3.11, cuda 12.8)</span></span>
+<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a><span class="ex">main-latest</span></span>
+<span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a><span class="co"># nightly build</span></span>
+<span id="cb2-8"><a href="#cb2-8" aria-hidden="true" tabindex="-1"></a><span class="ex">{branch}-{date_in_YYYYMMDD}-py{python_version}-cu{cuda_version}-{pytorch_version}</span></span>
+<span id="cb2-9"><a href="#cb2-9" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb2-10"><a href="#cb2-10" aria-hidden="true" tabindex="-1"></a><span class="co"># tagged release</span></span>
+<span id="cb2-11"><a href="#cb2-11" aria-hidden="true" tabindex="-1"></a><span class="ex">{version}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <div class="callout callout-style-default callout-tip callout-titled">
 <div class="callout-header d-flex align-content-center">
 <div class="callout-icon-container">
@@ -901,11 +952,12 @@ Tip
 </div>
 <p>Tags examples:</p>
 <ul>
-<li><code>main-py3.11-cu128-2.8.0</code></li>
 <li><code>main-py3.11-cu128-2.9.1</code></li>
+<li><code>main-py3.12-cu128-2.10.0</code></li>
+<li><code>main-py3.12-cu130-2.9.1</code></li>
+<li><code>main-py3.12-cu130-2.10.0</code></li>
 <li><code>main-latest</code></li>
-<li><code>main-20250303-py3.11-cu124-2.6.0</code></li>
-<li><code>main-20250303-py3.11-cu126-2.6.0</code></li>
+<li><code>main-20260315-py3.11-cu128-2.9.1</code></li>
 <li><code>0.12.0</code></li>
 </ul>
 </section>
@@ -928,8 +980,27 @@ Tip
 </div>
 <section id="image-2" class="level4">
 <h4 class="anchored" data-anchor-id="image-2">Image</h4>
-<pre><code>axolotlai/axolotl-cloud</code></pre>
-<p>Link: <a href="https://hub.docker.com/r/axolotlai/axolotl-cloud">Docker Hub</a></p>
+<table class="caption-top table">
+<thead>
+<tr class="header">
+<th>Variant</th>
+<th>Image</th>
+<th>Docker Hub</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td>pip</td>
+<td><code>axolotlai/axolotl-cloud</code></td>
+<td><a href="https://hub.docker.com/r/axolotlai/axolotl-cloud">Link</a></td>
+</tr>
+<tr class="even">
+<td>uv</td>
+<td><code>axolotlai/axolotl-cloud-uv</code></td>
+<td><a href="https://hub.docker.com/r/axolotlai/axolotl-cloud-uv">Link</a></td>
+</tr>
+</tbody>
+</table>
 </section>
 <section id="tags-format-1" class="level4">
 <h4 class="anchored" data-anchor-id="tags-format-1">Tags format</h4>
diff --git a/docs/ebft.html b/docs/ebft.html
index a49150984..123cd3f92 100644
--- a/docs/ebft.html
+++ b/docs/ebft.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/expert_quantization.html b/docs/expert_quantization.html
index c5d98d371..a56b3742e 100644
--- a/docs/expert_quantization.html
+++ b/docs/expert_quantization.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/faq.html b/docs/faq.html
index 777eaa3d4..9cf9937ac 100644
--- a/docs/faq.html
+++ b/docs/faq.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/fsdp_qlora.html b/docs/fsdp_qlora.html
index 1118c1e42..c1d9c76fc 100644
--- a/docs/fsdp_qlora.html
+++ b/docs/fsdp_qlora.html
@@ -662,12 +662,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link active">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/getting-started.html b/docs/getting-started.html
index 12af41f46..6ab6e0462 100644
--- a/docs/getting-started.html
+++ b/docs/getting-started.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/gradient_checkpointing.html b/docs/gradient_checkpointing.html
index e6678e8b8..c106e4d57 100644
--- a/docs/gradient_checkpointing.html
+++ b/docs/gradient_checkpointing.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/grpo.html b/docs/grpo.html
index 80a3b8db6..b184c268d 100644
--- a/docs/grpo.html
+++ b/docs/grpo.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/inference.html b/docs/inference.html
index ec3e9a3de..e8770b479 100644
--- a/docs/inference.html
+++ b/docs/inference.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/input_output.html b/docs/input_output.html
index 0961b3b39..036972c34 100644
--- a/docs/input_output.html
+++ b/docs/input_output.html
@@ -662,12 +662,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/installation.html b/docs/installation.html
index 04c3dd8bb..6aebb81ae 100644
--- a/docs/installation.html
+++ b/docs/installation.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -787,12 +781,11 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
    
   <ul>
   <li><a href="#sec-requirements" id="toc-sec-requirements" class="nav-link active" data-scroll-target="#sec-requirements"><span class="header-section-number">1</span> Requirements</a></li>
-  <li><a href="#sec-installation-methods" id="toc-sec-installation-methods" class="nav-link" data-scroll-target="#sec-installation-methods"><span class="header-section-number">2</span> Installation Methods</a>
+  <li><a href="#sec-installation" id="toc-sec-installation" class="nav-link" data-scroll-target="#sec-installation"><span class="header-section-number">2</span> Installation</a>
   <ul class="collapse">
-  <li><a href="#sec-pypi" id="toc-sec-pypi" class="nav-link" data-scroll-target="#sec-pypi"><span class="header-section-number">2.1</span> PyPI Installation (Recommended)</a></li>
-  <li><a href="#sec-uv" id="toc-sec-uv" class="nav-link" data-scroll-target="#sec-uv"><span class="header-section-number">2.2</span> uv Installation</a></li>
-  <li><a href="#sec-edge-build" id="toc-sec-edge-build" class="nav-link" data-scroll-target="#sec-edge-build"><span class="header-section-number">2.3</span> Edge/Development Build</a></li>
-  <li><a href="#sec-docker" id="toc-sec-docker" class="nav-link" data-scroll-target="#sec-docker"><span class="header-section-number">2.4</span> Docker</a></li>
+  <li><a href="#sec-uv" id="toc-sec-uv" class="nav-link" data-scroll-target="#sec-uv"><span class="header-section-number">2.1</span> Quick Install</a></li>
+  <li><a href="#sec-edge-build" id="toc-sec-edge-build" class="nav-link" data-scroll-target="#sec-edge-build"><span class="header-section-number">2.2</span> Edge/Development Build</a></li>
+  <li><a href="#sec-docker" id="toc-sec-docker" class="nav-link" data-scroll-target="#sec-docker"><span class="header-section-number">2.3</span> Docker</a></li>
   </ul></li>
   <li><a href="#sec-cloud" id="toc-sec-cloud" class="nav-link" data-scroll-target="#sec-cloud"><span class="header-section-number">3</span> Cloud Environments</a>
   <ul class="collapse">
@@ -804,11 +797,9 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <li><a href="#sec-macos" id="toc-sec-macos" class="nav-link" data-scroll-target="#sec-macos"><span class="header-section-number">4.1</span> macOS</a></li>
   <li><a href="#sec-windows" id="toc-sec-windows" class="nav-link" data-scroll-target="#sec-windows"><span class="header-section-number">4.2</span> Windows</a></li>
   </ul></li>
-  <li><a href="#sec-env-managers" id="toc-sec-env-managers" class="nav-link" data-scroll-target="#sec-env-managers"><span class="header-section-number">5</span> Environment Managers</a>
-  <ul class="collapse">
-  <li><a href="#sec-conda" id="toc-sec-conda" class="nav-link" data-scroll-target="#sec-conda"><span class="header-section-number">5.1</span> Conda/Pip venv</a></li>
-  </ul></li>
-  <li><a href="#sec-troubleshooting" id="toc-sec-troubleshooting" class="nav-link" data-scroll-target="#sec-troubleshooting"><span class="header-section-number">6</span> Troubleshooting</a></li>
+  <li><a href="#sec-migrating" id="toc-sec-migrating" class="nav-link" data-scroll-target="#sec-migrating"><span class="header-section-number">5</span> Migrating from pip to uv</a></li>
+  <li><a href="#sec-pip" id="toc-sec-pip" class="nav-link" data-scroll-target="#sec-pip"><span class="header-section-number">6</span> Using pip (Alternative)</a></li>
+  <li><a href="#sec-troubleshooting" id="toc-sec-troubleshooting" class="nav-link" data-scroll-target="#sec-troubleshooting"><span class="header-section-number">7</span> Troubleshooting</a></li>
   </ul>
 </nav>
     </div>
@@ -840,25 +831,11 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <ul>
 <li>NVIDIA GPU (Ampere architecture or newer for <code>bf16</code> and Flash Attention) or AMD GPU</li>
 <li>Python ≥3.11</li>
-<li>PyTorch ≥2.6.0</li>
+<li>PyTorch ≥2.9.0</li>
 </ul>
 </section>
-<section id="sec-installation-methods" class="level2" data-number="2">
-<h2 data-number="2" class="anchored" data-anchor-id="sec-installation-methods"><span class="header-section-number">2</span> Installation Methods</h2>
-<div class="callout callout-style-default callout-important callout-titled">
-<div class="callout-header d-flex align-content-center">
-<div class="callout-icon-container">
-<i class="callout-icon"></i>
-</div>
-<div class="callout-title-container flex-fill">
-Important
-</div>
-</div>
-<div class="callout-body-container callout-body">
-<p>Please make sure to have Pytorch installed before installing Axolotl in your local environment.</p>
-<p>Follow the instructions at: <a href="https://pytorch.org/get-started/locally/">https://pytorch.org/get-started/locally/</a></p>
-</div>
-</div>
+<section id="sec-installation" class="level2" data-number="2">
+<h2 data-number="2" class="anchored" data-anchor-id="sec-installation"><span class="header-section-number">2</span> Installation</h2>
 <div class="callout callout-style-default callout-important callout-titled">
 <div class="callout-header d-flex align-content-center">
 <div class="callout-icon-container">
@@ -872,50 +849,33 @@ Important
 <p>For Blackwell GPUs, please use Pytorch 2.9.1 and CUDA 12.8.</p>
 </div>
 </div>
-<section id="sec-pypi" class="level3" data-number="2.1">
-<h3 data-number="2.1" class="anchored" data-anchor-id="sec-pypi"><span class="header-section-number">2.1</span> PyPI Installation (Recommended)</h3>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">-U</span> packaging setuptools wheel ninja</span>
-<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> axolotl<span class="pp">[</span><span class="ss">flash</span><span class="pp">-</span><span class="ss">attn,deepspeed</span><span class="pp">]</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
-<p>We use <code>--no-build-isolation</code> in order to detect the installed PyTorch version (if
-installed) in order not to clobber it, and so that we set the correct version of
-dependencies that are specific to the PyTorch version or other installed
-co-dependencies.</p>
+<section id="sec-uv" class="level3" data-number="2.1">
+<h3 data-number="2.1" class="anchored" data-anchor-id="sec-uv"><span class="header-section-number">2.1</span> Quick Install</h3>
+<p>Axolotl uses <a href="https://docs.astral.sh/uv/">uv</a> as its package manager. uv is a fast, reliable Python package installer and resolver written in Rust.</p>
+<p>Install uv if not already installed:</p>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="ex">curl</span> <span class="at">-LsSf</span> https://astral.sh/uv/install.sh <span class="kw">|</span> <span class="fu">sh</span></span>
+<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> <span class="va">$HOME</span>/.local/bin/env</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<p>Choose your CUDA version (e.g.&nbsp;<code>cu128</code>, <code>cu130</code>), create a venv, and install:</p>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">UV_TORCH_BACKEND</span><span class="op">=</span>cu128  <span class="co"># or cu130</span></span>
+<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> venv <span class="at">--no-project</span> <span class="at">--relocatable</span></span>
+<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> .venv/bin/activate</span>
+<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> axolotl<span class="pp">[</span><span class="ss">flash</span><span class="pp">-</span><span class="ss">attn,deepspeed</span><span class="pp">]</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 </section>
-<section id="sec-uv" class="level3" data-number="2.2">
-<h3 data-number="2.2" class="anchored" data-anchor-id="sec-uv"><span class="header-section-number">2.2</span> uv Installation</h3>
-<p>uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.</p>
-<p>Install uv if not already installed</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="ex">curl</span> <span class="at">-LsSf</span> https://astral.sh/uv/install.sh <span class="kw">|</span> <span class="fu">sh</span></span>
-<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> <span class="va">$HOME</span>/.local/bin/env</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
-<p>Choose your CUDA version to use with PyTorch; e.g.&nbsp;<code>cu124</code>, <code>cu126</code>, <code>cu128</code>,
-then create the venv and activate</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb3"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">UV_TORCH_BACKEND</span><span class="op">=</span>cu126</span>
-<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> venv <span class="at">--no-project</span> <span class="at">--relocatable</span></span>
-<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> .venv/bin/activate</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
-<p>Install PyTorch
-- PyTorch 2.6.0 recommended</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb4"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install packaging setuptools wheel</span>
-<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install torch==2.6.0</span>
-<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install awscli pydantic</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
-<p>Install axolotl from PyPi</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb5"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> axolotl<span class="pp">[</span><span class="ss">deepspeed,flash</span><span class="pp">-</span><span class="ss">attn</span><span class="pp">]</span></span>
-<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a><span class="co"># optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO</span></span>
-<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> axolotl<span class="pp">[</span><span class="ss">deepspeed,flash</span><span class="pp">-</span><span class="ss">attn,vllm</span><span class="pp">]</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
-</section>
-<section id="sec-edge-build" class="level3" data-number="2.3">
-<h3 data-number="2.3" class="anchored" data-anchor-id="sec-edge-build"><span class="header-section-number">2.3</span> Edge/Development Build</h3>
+<section id="sec-edge-build" class="level3" data-number="2.2">
+<h3 data-number="2.2" class="anchored" data-anchor-id="sec-edge-build"><span class="header-section-number">2.2</span> Edge/Development Build</h3>
 <p>For the latest features between releases:</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb6"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="fu">git</span> clone https://github.com/axolotl-ai-cloud/axolotl.git</span>
-<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> axolotl</span>
-<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">-U</span> packaging setuptools wheel ninja</span>
-<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="at">-e</span> <span class="st">'.[flash-attn,deepspeed]'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb3"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="fu">git</span> clone https://github.com/axolotl-ai-cloud/axolotl.git</span>
+<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> axolotl</span>
+<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">UV_TORCH_BACKEND</span><span class="op">=</span>cu128  <span class="co"># or cu130</span></span>
+<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> sync <span class="at">--extra</span> flash-attn <span class="at">--extra</span> deepspeed</span>
+<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> .venv/bin/activate</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<p><code>uv sync</code> creates a <code>.venv</code>, installs exact pinned versions from <code>uv.lock</code>, and sets up an editable install automatically.</p>
 </section>
-<section id="sec-docker" class="level3" data-number="2.4">
-<h3 data-number="2.4" class="anchored" data-anchor-id="sec-docker"><span class="header-section-number">2.4</span> Docker</h3>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb7"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="ex">docker</span> run <span class="at">--gpus</span> <span class="st">'"all"'</span> <span class="at">--rm</span> <span class="at">-it</span> axolotlai/axolotl:main-latest</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<section id="sec-docker" class="level3" data-number="2.3">
+<h3 data-number="2.3" class="anchored" data-anchor-id="sec-docker"><span class="header-section-number">2.3</span> Docker</h3>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb4"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="ex">docker</span> run <span class="at">--gpus</span> <span class="st">'"all"'</span> <span class="at">--rm</span> <span class="at">-it</span> <span class="at">--ipc</span><span class="op">=</span>host axolotlai/axolotl-uv:main-latest</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <p>For development with Docker:</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb8"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="ex">docker</span> compose up <span class="at">-d</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb5"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="ex">docker</span> compose up <span class="at">-d</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <div class="callout callout-style-default callout-tip callout-titled">
 <div class="callout-header d-flex align-content-center">
 <div class="callout-icon-container">
@@ -926,12 +886,12 @@ then create the venv and activate</p>
 </div>
 </div>
 <div class="callout-body-container callout-body">
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb9"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="ex">docker</span> run <span class="at">--privileged</span> <span class="at">--gpus</span> <span class="st">'"all"'</span> <span class="at">--shm-size</span> 10g <span class="at">--rm</span> <span class="at">-it</span> <span class="dt">\</span></span>
-<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a>  <span class="at">--name</span> axolotl <span class="at">--ipc</span><span class="op">=</span>host <span class="dt">\</span></span>
-<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a>  <span class="at">--ulimit</span> memlock=-1 <span class="at">--ulimit</span> stack=67108864 <span class="dt">\</span></span>
-<span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a>  <span class="at">--mount</span> type=bind,src=<span class="st">"</span><span class="va">${PWD}</span><span class="st">"</span>,target=/workspace/axolotl <span class="dt">\</span></span>
-<span id="cb9-5"><a href="#cb9-5" aria-hidden="true" tabindex="-1"></a>  <span class="at">-v</span> <span class="va">${HOME}</span>/.cache/huggingface:/root/.cache/huggingface <span class="dt">\</span></span>
-<span id="cb9-6"><a href="#cb9-6" aria-hidden="true" tabindex="-1"></a>  axolotlai/axolotl:main-latest</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb6"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="ex">docker</span> run <span class="at">--privileged</span> <span class="at">--gpus</span> <span class="st">'"all"'</span> <span class="at">--shm-size</span> 10g <span class="at">--rm</span> <span class="at">-it</span> <span class="dt">\</span></span>
+<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a>  <span class="at">--name</span> axolotl <span class="at">--ipc</span><span class="op">=</span>host <span class="dt">\</span></span>
+<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a>  <span class="at">--ulimit</span> memlock=-1 <span class="at">--ulimit</span> stack=67108864 <span class="dt">\</span></span>
+<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a>  <span class="at">--mount</span> type=bind,src=<span class="st">"</span><span class="va">${PWD}</span><span class="st">"</span>,target=/workspace/axolotl <span class="dt">\</span></span>
+<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a>  <span class="at">-v</span> <span class="va">${HOME}</span>/.cache/huggingface:/root/.cache/huggingface <span class="dt">\</span></span>
+<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a>  axolotlai/axolotl-uv:main-latest</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 </div>
 </div>
 <div class="callout callout-style-default callout-important callout-titled">
@@ -944,7 +904,7 @@ Important
 </div>
 </div>
 <div class="callout-body-container callout-body">
-<p>For Blackwell GPUs, please use <code>axolotlai/axolotl:main-py3.11-cu128-2.9.1</code> or the cloud variant <code>axolotlai/axolotl-cloud:main-py3.11-cu128-2.9.1</code>.</p>
+<p>For Blackwell GPUs, please use <code>axolotlai/axolotl-uv:main-py3.11-cu128-2.9.1</code> or the cloud variant <code>axolotlai/axolotl-cloud-uv:main-py3.11-cu128-2.9.1</code>.</p>
 </div>
 </div>
 <p>Please refer to the <a href="../docs/docker.html">Docker documentation</a> for more information on the different Docker images that are available.</p>
@@ -956,7 +916,7 @@ Important
 <h3 data-number="3.1" class="anchored" data-anchor-id="sec-cloud-gpu"><span class="header-section-number">3.1</span> Cloud GPU Providers</h3>
 <p>For providers supporting Docker:</p>
 <ul>
-<li>Use <code>axolotlai/axolotl-cloud:main-latest</code></li>
+<li>Use <code>axolotlai/axolotl-cloud-uv:main-latest</code></li>
 <li>Available on:
 <ul>
 <li><a href="https://runpod.io/gsc?template=v2ickqhz9s&amp;ref=6i7fkpdz">RunPod</a></li>
@@ -978,8 +938,8 @@ Important
 <h2 data-number="4" class="anchored" data-anchor-id="sec-platform-specific"><span class="header-section-number">4</span> Platform-Specific Instructions</h2>
 <section id="sec-macos" class="level3" data-number="4.1">
 <h3 data-number="4.1" class="anchored" data-anchor-id="sec-macos"><span class="header-section-number">4.1</span> macOS</h3>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb10"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="at">-e</span> <span class="st">'.'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
-<p>See <a href="#sec-troubleshooting" class="quarto-xref">Section&nbsp;6</a> for Mac-specific issues.</p>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb7"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> <span class="at">-e</span> <span class="st">'.'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<p>See <a href="#sec-troubleshooting" class="quarto-xref">Section&nbsp;7</a> for Mac-specific issues.</p>
 </section>
 <section id="sec-windows" class="level3" data-number="4.2">
 <h3 data-number="4.2" class="anchored" data-anchor-id="sec-windows"><span class="header-section-number">4.2</span> Windows</h3>
@@ -998,23 +958,46 @@ Important
 </div>
 </section>
 </section>
-<section id="sec-env-managers" class="level2" data-number="5">
-<h2 data-number="5" class="anchored" data-anchor-id="sec-env-managers"><span class="header-section-number">5</span> Environment Managers</h2>
-<section id="sec-conda" class="level3" data-number="5.1">
-<h3 data-number="5.1" class="anchored" data-anchor-id="sec-conda"><span class="header-section-number">5.1</span> Conda/Pip venv</h3>
-<ol type="1">
-<li><p>Install Python ≥3.11</p></li>
-<li><p>Install PyTorch: https://pytorch.org/get-started/locally/</p></li>
-<li><p>Install Axolotl:</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb11"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">-U</span> packaging setuptools wheel ninja</span>
-<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="at">-e</span> <span class="st">'.[flash-attn,deepspeed]'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
-<li><p>(Optional) Login to Hugging Face:</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb12"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="ex">hf</span> auth login</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
-</ol>
+<section id="sec-migrating" class="level2" data-number="5">
+<h2 data-number="5" class="anchored" data-anchor-id="sec-migrating"><span class="header-section-number">5</span> Migrating from pip to uv</h2>
+<p>If you have an existing pip-based Axolotl installation, you can migrate to uv:</p>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb8"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Install uv</span></span>
+<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="ex">curl</span> <span class="at">-LsSf</span> https://astral.sh/uv/install.sh <span class="kw">|</span> <span class="fu">sh</span></span>
+<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> <span class="va">$HOME</span>/.local/bin/env</span>
+<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb8-5"><a href="#cb8-5" aria-hidden="true" tabindex="-1"></a><span class="co"># Create a fresh venv (recommended for a clean start)</span></span>
+<span id="cb8-6"><a href="#cb8-6" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">UV_TORCH_BACKEND</span><span class="op">=</span>cu128  <span class="co"># or cu130</span></span>
+<span id="cb8-7"><a href="#cb8-7" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> venv <span class="at">--no-project</span> <span class="at">--relocatable</span></span>
+<span id="cb8-8"><a href="#cb8-8" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> .venv/bin/activate</span>
+<span id="cb8-9"><a href="#cb8-9" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb8-10"><a href="#cb8-10" aria-hidden="true" tabindex="-1"></a><span class="co"># Reinstall axolotl</span></span>
+<span id="cb8-11"><a href="#cb8-11" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> axolotl<span class="pp">[</span><span class="ss">flash</span><span class="pp">-</span><span class="ss">attn,deepspeed</span><span class="pp">]</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 </section>
+<section id="sec-pip" class="level2" data-number="6">
+<h2 data-number="6" class="anchored" data-anchor-id="sec-pip"><span class="header-section-number">6</span> Using pip (Alternative)</h2>
+<p>If you are unable to install uv, you can still use pip directly.</p>
+<div class="callout callout-style-default callout-important callout-titled">
+<div class="callout-header d-flex align-content-center">
+<div class="callout-icon-container">
+<i class="callout-icon"></i>
+</div>
+<div class="callout-title-container flex-fill">
+Important
+</div>
+</div>
+<div class="callout-body-container callout-body">
+<p>Please make sure to have PyTorch installed before installing Axolotl with pip.</p>
+<p>Follow the instructions at: <a href="https://pytorch.org/get-started/locally/">https://pytorch.org/get-started/locally/</a></p>
+</div>
+</div>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb9"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">-U</span> packaging setuptools wheel ninja</span>
+<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> axolotl<span class="pp">[</span><span class="ss">flash</span><span class="pp">-</span><span class="ss">attn,deepspeed</span><span class="pp">]</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<p>For editable/development installs:</p>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb10"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">-U</span> packaging setuptools wheel ninja</span>
+<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="at">-e</span> <span class="st">'.[flash-attn,deepspeed]'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 </section>
-<section id="sec-troubleshooting" class="level2" data-number="6">
-<h2 data-number="6" class="anchored" data-anchor-id="sec-troubleshooting"><span class="header-section-number">6</span> Troubleshooting</h2>
+<section id="sec-troubleshooting" class="level2" data-number="7">
+<h2 data-number="7" class="anchored" data-anchor-id="sec-troubleshooting"><span class="header-section-number">7</span> Troubleshooting</h2>
 <p>If you encounter installation issues, see our <a href="../docs/faq.html">FAQ</a> and <a href="../docs/debugging.html">Debugging Guide</a>.</p>
 
 
diff --git a/docs/lora_optims.html b/docs/lora_optims.html
index 1ef931140..a4f102e54 100644
--- a/docs/lora_optims.html
+++ b/docs/lora_optims.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/lr_groups.html b/docs/lr_groups.html
index d45802a34..81a41bbf9 100644
--- a/docs/lr_groups.html
+++ b/docs/lr_groups.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/mac.html b/docs/mac.html
index 6ef3463f9..cc873c65e 100644
--- a/docs/mac.html
+++ b/docs/mac.html
@@ -662,12 +662,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/mixed_precision.html b/docs/mixed_precision.html
index 8740e7f7b..6576b0f0a 100644
--- a/docs/mixed_precision.html
+++ b/docs/mixed_precision.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/models/LiquidAI.html b/docs/models/LiquidAI.html
index f3ea1bc9c..455326c92 100644
--- a/docs/models/LiquidAI.html
+++ b/docs/models/LiquidAI.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -827,8 +821,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <li><p>Install Axolotl following the <a href="https://docs.axolotl.ai/docs/installation.html">installation guide</a>.</p>
 <p>Here is an example of how to install from pip:</p>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Ensure you have a compatible version of Pytorch installed</span></span>
-<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install packaging setuptools wheel ninja</span>
-<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="st">'axolotl[flash-attn]&gt;=0.12.0'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
+<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> <span class="st">'axolotl[flash-attn]&gt;=0.12.0'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
 <li><p>Run one of the finetuning examples below.</p>
 <p><strong>LFM2</strong></p>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="co"># FFT SFT (1x48GB @ 25GiB)</span></span>
@@ -837,7 +830,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb3"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="co"># LoRA SFT (1x48GB @ 2.7GiB)</span></span>
 <span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> train examples/LiquidAI/lfm2-vl-lora.yaml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <p><strong>LFM2-MoE</strong></p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb4"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip</span> install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6</span>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb4"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6</span>
 <span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a></span>
 <span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="co"># LoRA SFT (1x48GB @ 16.2GiB)</span></span>
 <span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> train examples/LiquidAI/lfm2-8b-a1b-lora.yaml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
@@ -846,7 +839,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <h3 class="anchored" data-anchor-id="tips">TIPS</h3>
 <ul>
 <li><p><strong>Installation Error</strong>: If you encounter <code>ImportError: ... undefined symbol ...</code> or <code>ModuleNotFoundError: No module named 'causal_conv1d_cuda'</code>, the <code>causal-conv1d</code> package may have been installed incorrectly. Try uninstalling it:</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb5"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip</span> uninstall <span class="at">-y</span> causal-conv1d</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb5"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip uninstall causal-conv1d</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
 <li><p><strong>Dataset Loading</strong>: Read more on how to load your own dataset in our <a href="https://docs.axolotl.ai/docs/dataset_loading.html">documentation</a>.</p></li>
 <li><p><strong>Dataset Formats</strong>:</p>
 <ul>
diff --git a/docs/models/apertus.html b/docs/models/apertus.html
index f16a01f4a..3b61f46d6 100644
--- a/docs/models/apertus.html
+++ b/docs/models/apertus.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -830,11 +824,10 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">git</span> clone https://github.com/axolotl-ai-cloud/axolotl.git</span>
 <span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> axolotl</span>
 <span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install packaging==26.0 setuptools==75.8.0 wheel ninja</span>
-<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="at">-e</span> <span class="st">'.[flash-attn]'</span></span>
-<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="co"># Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy</span></span>
-<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a><span class="ex">python</span> scripts/cutcrossentropy_install.py <span class="kw">|</span> <span class="fu">sh</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> <span class="at">-e</span> <span class="st">'.[flash-attn]'</span></span>
+<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy</span></span>
+<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="ex">python</span> scripts/cutcrossentropy_install.py <span class="kw">|</span> <span class="fu">sh</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <ol start="2" type="1">
 <li>(Optional, highly recommended) Install XIELU CUDA</li>
 </ol>
@@ -844,7 +837,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a><span class="co"># For those using our Docker image, use the below path.</span></span>
 <span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">CUDA_HOME</span><span class="op">=</span>/usr/local/cuda</span>
 <span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install git+https://github.com/nickjbrowning/XIELU@59d6031 <span class="at">--no-build-isolation</span> <span class="at">--no-deps</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install git+https://github.com/nickjbrowning/XIELU@59d6031 <span class="at">--no-build-isolation</span> <span class="at">--no-deps</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <p>For any installation errors, see <a href="#xielu-installation-issues">XIELU Installation Issues</a></p>
 <ol start="3" type="1">
 <li>Run the finetuning example:</li>
@@ -872,7 +865,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <p>If those didn’t help, please try the below solutions:</p>
 <ol type="1">
 <li><p>Pass env for CMAKE and try install again:</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb4"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="va">Python_EXECUTABLE</span><span class="op">=</span><span class="va">$(</span><span class="fu">which</span> python<span class="va">)</span> <span class="ex">pip3</span> install git+https://github.com/nickjbrowning/XIELU@59d6031 <span class="at">--no-build-isolation</span> <span class="at">--no-deps</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb4"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="va">Python_EXECUTABLE</span><span class="op">=</span><span class="va">$(</span><span class="fu">which</span> python<span class="va">)</span> <span class="ex">uv</span> pip install git+https://github.com/nickjbrowning/XIELU@59d6031 <span class="at">--no-build-isolation</span> <span class="at">--no-deps</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
 <li><p>Git clone the repo and manually hardcode python path:</p>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb5"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="fu">git</span> clone https://github.com/nickjbrowning/XIELU</span>
 <span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> xielu</span>
@@ -887,7 +880,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a>    OUTPUT_VARIABLE TORCH_CMAKE_PATH_OUTPUT</span>
 <span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a>    ERROR_VARIABLE TORCH_CMAKE_PATH_ERROR</span>
 <span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a>)</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb7"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install . <span class="at">--no-build-isolation</span> <span class="at">--no-deps</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb7"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install . <span class="at">--no-build-isolation</span> <span class="at">--no-deps</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
 </ol>
 </section>
 </section>
diff --git a/docs/models/arcee.html b/docs/models/arcee.html
index e4c59a52a..cc6a4e0ab 100644
--- a/docs/models/arcee.html
+++ b/docs/models/arcee.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -830,11 +824,10 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">git</span> clone https://github.com/axolotl-ai-cloud/axolotl.git</span>
 <span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> axolotl</span>
 <span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install packaging==26.0 setuptools==75.8.0 wheel ninja</span>
-<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="at">-e</span> <span class="st">'.[flash-attn]'</span></span>
-<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="co"># Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy</span></span>
-<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a><span class="ex">python</span> scripts/cutcrossentropy_install.py <span class="kw">|</span> <span class="fu">sh</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> <span class="at">-e</span> <span class="st">'.[flash-attn]'</span></span>
+<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy</span></span>
+<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="ex">python</span> scripts/cutcrossentropy_install.py <span class="kw">|</span> <span class="fu">sh</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <ol start="2" type="1">
 <li>Run the finetuning example:</li>
 </ol>
diff --git a/docs/models/devstral.html b/docs/models/devstral.html
index 2e424522c..645addf1b 100644
--- a/docs/models/devstral.html
+++ b/docs/models/devstral.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -830,8 +824,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <p>Here is an example of how to install from pip:</p></li>
 </ol>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Ensure you have Pytorch installed (Pytorch 2.6.0 min)</span></span>
-<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install packaging==26.0 setuptools==75.8.0 wheel ninja</span>
-<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="st">'axolotl[flash-attn]&gt;=0.12.0'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> <span class="st">'axolotl[flash-attn]&gt;=0.12.0'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <ol start="2" type="1">
 <li>Install <a href="https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy">Cut Cross Entropy</a> to reduce training VRAM usage</li>
 </ol>
diff --git a/docs/models/gemma3n.html b/docs/models/gemma3n.html
index 0b7590103..8acb4cb01 100644
--- a/docs/models/gemma3n.html
+++ b/docs/models/gemma3n.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -825,15 +819,14 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <p>Here is an example of how to install from pip:</p></li>
 </ol>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Ensure you have Pytorch installed (Pytorch 2.6.0 min)</span></span>
-<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install packaging==26.0 setuptools==75.8.0 wheel ninja</span>
-<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="st">'axolotl[flash-attn]&gt;=0.12.0'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> <span class="st">'axolotl[flash-attn]&gt;=0.12.0'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <ol start="2" type="1">
 <li>In addition to Axolotl’s requirements, Gemma-3n requires:</li>
 </ol>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install timm==1.0.17</span>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install timm==1.0.17</span>
 <span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a></span>
 <span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a><span class="co"># for loading audio data</span></span>
-<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install librosa==0.11.0</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install librosa==0.11.0</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <ol start="3" type="1">
 <li>Download sample dataset files</li>
 </ol>
diff --git a/docs/models/gpt-oss.html b/docs/models/gpt-oss.html
index 3ea3e255e..dc94f4712 100644
--- a/docs/models/gpt-oss.html
+++ b/docs/models/gpt-oss.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -832,8 +826,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <p>Here is an example of how to install from pip:</p></li>
 </ol>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Ensure you have Pytorch installed (Pytorch 2.6.0 min)</span></span>
-<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install packaging==26.0 setuptools==75.8.0 wheel ninja</span>
-<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="st">'axolotl[flash-attn]&gt;=0.12.0'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> <span class="st">'axolotl[flash-attn]&gt;=0.12.0'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <ol start="2" type="1">
 <li>Choose one of the following configs below for training the 20B model. (for 120B, see <a href="#training-120b">below</a>)</li>
 </ol>
@@ -881,7 +874,7 @@ weights to <code>{output_dir}/merged</code>.</p>
 <p>GPT-OSS support in vLLM does not exist in a stable release yet. See https://x.com/MaziyarPanahi/status/1955741905515323425
 for more information about using a special vllm-openai docker image for inferencing with vLLM.</p>
 <p>Optionally, vLLM can be installed from nightly:</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb7"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip</span> install <span class="at">--no-build-isolation</span> <span class="at">--pre</span> <span class="at">-U</span> vllm <span class="at">--extra-index-url</span> https://wheels.vllm.ai/nightly</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb7"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> <span class="at">--pre</span> <span class="at">-U</span> vllm <span class="at">--extra-index-url</span> https://wheels.vllm.ai/nightly</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <p>and the vLLM server can be started with the following command (modify <code>--tensor-parallel-size 8</code> to match your environment):</p>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb8"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="ex">vllm</span> serve ./outputs/gpt-oss-out/ <span class="at">--served-model-name</span> axolotl/gpt-oss-20b <span class="at">--host</span> 0.0.0.0 <span class="at">--port</span> 8888  <span class="at">--tensor-parallel-size</span> 8</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 </section>
diff --git a/docs/models/granite4.html b/docs/models/granite4.html
index 9fee083ad..ab1b209da 100644
--- a/docs/models/granite4.html
+++ b/docs/models/granite4.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -830,11 +824,10 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">git</span> clone https://github.com/axolotl-ai-cloud/axolotl.git</span>
 <span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> axolotl</span>
 <span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install packaging==26.0 setuptools==75.8.0 wheel ninja</span>
-<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="at">-e</span> <span class="st">'.[flash-attn]'</span></span>
-<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="co"># Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy</span></span>
-<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a><span class="ex">python</span> scripts/cutcrossentropy_install.py <span class="kw">|</span> <span class="fu">sh</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> <span class="at">-e</span> <span class="st">'.[flash-attn]'</span></span>
+<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy</span></span>
+<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="ex">python</span> scripts/cutcrossentropy_install.py <span class="kw">|</span> <span class="fu">sh</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <ol start="2" type="1">
 <li>Run the finetuning example:</li>
 </ol>
diff --git a/docs/models/hunyuan.html b/docs/models/hunyuan.html
index b1c2a71a9..71b81b748 100644
--- a/docs/models/hunyuan.html
+++ b/docs/models/hunyuan.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -829,11 +823,10 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">git</span> clone https://github.com/axolotl-ai-cloud/axolotl.git</span>
 <span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="bu">cd</span> axolotl</span>
 <span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install packaging==26.0 setuptools==75.8.0 wheel ninja</span>
-<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="at">-e</span> <span class="st">'.[flash-attn]'</span></span>
-<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="co"># Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy</span></span>
-<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a><span class="ex">python</span> scripts/cutcrossentropy_install.py <span class="kw">|</span> <span class="fu">sh</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> <span class="at">-e</span> <span class="st">'.[flash-attn]'</span></span>
+<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy</span></span>
+<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="ex">python</span> scripts/cutcrossentropy_install.py <span class="kw">|</span> <span class="fu">sh</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <ol start="2" type="1">
 <li>Run the finetuning example:</li>
 </ol>
diff --git a/docs/models/index.html b/docs/models/index.html
index 817065731..ea67c6e0c 100644
--- a/docs/models/index.html
+++ b/docs/models/index.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/models/internvl3_5.html b/docs/models/internvl3_5.html
index b77749041..24e739e5f 100644
--- a/docs/models/internvl3_5.html
+++ b/docs/models/internvl3_5.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -824,7 +818,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <ol type="1">
 <li><p>Install Axolotl following the <a href="https://docs.axolotl.ai/docs/installation.html">installation guide</a>.</p></li>
 <li><p>Install <code>timm</code> for vision model support:</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip</span> install timm==1.0.19</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install timm==1.0.19</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
 <li><p>Install <a href="https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy">Cut Cross Entropy</a> to reduce training VRAM usage.</p></li>
 <li><p>Run the finetuning example:</p>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> train examples/internvl3_5/internvl3_5-8b-qlora.yml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
diff --git a/docs/models/jamba.html b/docs/models/jamba.html
index d40059fe7..e4a61a7e8 100644
--- a/docs/models/jamba.html
+++ b/docs/models/jamba.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/models/kimi-linear.html b/docs/models/kimi-linear.html
index 46ded70a7..ac6ae549e 100644
--- a/docs/models/kimi-linear.html
+++ b/docs/models/kimi-linear.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/models/llama-2.html b/docs/models/llama-2.html
index 71137c25a..104cd58b1 100644
--- a/docs/models/llama-2.html
+++ b/docs/models/llama-2.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/models/llama-4.html b/docs/models/llama-4.html
index 855552a7f..e4c9eb99e 100644
--- a/docs/models/llama-4.html
+++ b/docs/models/llama-4.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/models/magistral.html b/docs/models/magistral.html
index 08f56e682..a73d5db3f 100644
--- a/docs/models/magistral.html
+++ b/docs/models/magistral.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -831,8 +825,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <p>Here is an example of how to install from pip:</p></li>
 </ol>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Ensure you have Pytorch installed (Pytorch 2.7.0 min)</span></span>
-<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install packaging==26.0 setuptools==75.8.0 wheel ninja</span>
-<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="st">'axolotl[flash-attn]&gt;=0.12.0'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> <span class="st">'axolotl[flash-attn]&gt;=0.12.0'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <ol start="2" type="1">
 <li>Install <a href="https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy">Cut Cross Entropy</a> to reduce training VRAM usage</li>
 </ol>
diff --git a/docs/models/magistral/think.html b/docs/models/magistral/think.html
index bb3d226dc..edb3e06dc 100644
--- a/docs/models/magistral/think.html
+++ b/docs/models/magistral/think.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/models/magistral/vision.html b/docs/models/magistral/vision.html
index 0f3e1f79b..91d35192d 100644
--- a/docs/models/magistral/vision.html
+++ b/docs/models/magistral/vision.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -830,7 +824,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <h2 class="anchored" data-anchor-id="getting-started">Getting started</h2>
 <ol type="1">
 <li><p>Install the required vision lib:
-<code>bash  pip install 'mistral-common[opencv]==1.8.5'</code></p></li>
+<code>uv pip install 'mistral-common[opencv]==1.8.5'</code></p></li>
 <li><p>Download the example dataset image:</p>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">wget</span> https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
 <li><p>Run the fine-tuning:</p>
diff --git a/docs/models/mimo.html b/docs/models/mimo.html
index b1a0894bc..ac8d1076b 100644
--- a/docs/models/mimo.html
+++ b/docs/models/mimo.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/models/ministral.html b/docs/models/ministral.html
index 86e5b7785..33fcf29e2 100644
--- a/docs/models/ministral.html
+++ b/docs/models/ministral.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/models/ministral3.html b/docs/models/ministral3.html
index 04b46cd01..abb5e8903 100644
--- a/docs/models/ministral3.html
+++ b/docs/models/ministral3.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -837,7 +831,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="fu">git</span> checkout transformers-v5</span>
 <span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a></span>
 <span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="co"># Install packages for transformers v5</span></span>
-<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a><span class="ex">pip</span> install <span class="at">-e</span> .</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
+<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">-e</span> .</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
 <li><p>Run the fine-tuning:</p>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> train ministral3-3b-qlora.yaml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
 </ol>
diff --git a/docs/models/ministral3/think.html b/docs/models/ministral3/think.html
index 280c942d3..e949b8c0c 100644
--- a/docs/models/ministral3/think.html
+++ b/docs/models/ministral3/think.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/models/ministral3/vision.html b/docs/models/ministral3/vision.html
index 99c1f4102..0425a34dc 100644
--- a/docs/models/ministral3/vision.html
+++ b/docs/models/ministral3/vision.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -830,7 +824,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <h2 class="anchored" data-anchor-id="getting-started">Getting started</h2>
 <ol type="1">
 <li><p>Install the required vision lib:
-<code>bash  pip install 'mistral-common[opencv]==1.8.6'</code></p></li>
+<code>uv pip install 'mistral-common[opencv]==1.8.6'</code></p></li>
 <li><p>Download the example dataset image:</p>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">wget</span> https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
 <li><p>Run the fine-tuning:</p>
diff --git a/docs/models/mistral-small.html b/docs/models/mistral-small.html
index d2f385550..59eff9e97 100644
--- a/docs/models/mistral-small.html
+++ b/docs/models/mistral-small.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -827,7 +821,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <h2 class="anchored" data-anchor-id="getting-started">Getting Started</h2>
 <ol type="1">
 <li><p>Install the required vision lib:
-<code>bash  pip install 'mistral-common[opencv]==1.8.5'</code></p></li>
+<code>uv pip install 'mistral-common[opencv]==1.8.5'</code></p></li>
 <li><p>Download the example dataset image:</p>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">wget</span> https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
 <li><p>Run the fine-tuning:</p>
diff --git a/docs/models/mistral.html b/docs/models/mistral.html
index e57dc9961..1f9e9c343 100644
--- a/docs/models/mistral.html
+++ b/docs/models/mistral.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/models/olmo3.html b/docs/models/olmo3.html
index 6ed27159d..79a60ae16 100644
--- a/docs/models/olmo3.html
+++ b/docs/models/olmo3.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/models/orpheus.html b/docs/models/orpheus.html
index 95a1cb447..9bb19062b 100644
--- a/docs/models/orpheus.html
+++ b/docs/models/orpheus.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/models/phi.html b/docs/models/phi.html
index 48e5c8707..e477fc26d 100644
--- a/docs/models/phi.html
+++ b/docs/models/phi.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/models/plano.html b/docs/models/plano.html
index c62a4470d..d9c4021d4 100644
--- a/docs/models/plano.html
+++ b/docs/models/plano.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/models/qwen3-next.html b/docs/models/qwen3-next.html
index f9f99bc6b..af76b6383 100644
--- a/docs/models/qwen3-next.html
+++ b/docs/models/qwen3-next.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -826,7 +820,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <li><p>Install <a href="https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy">Cut Cross Entropy</a> to reduce training VRAM usage.</p></li>
 <li><p>Install FLA for improved performance</p></li>
 </ol>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> uninstall <span class="at">-y</span> causal-conv1d <span class="kw">&amp;&amp;</span> <span class="ex">pip3</span> install flash-linear-attention==0.4.1</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip uninstall causal-conv1d <span class="kw">&amp;&amp;</span> <span class="ex">uv</span> pip install flash-linear-attention==0.4.1</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <ol start="4" type="1">
 <li>Run the finetuning example:</li>
 </ol>
diff --git a/docs/models/qwen3.html b/docs/models/qwen3.html
index f16e91ca2..892376051 100644
--- a/docs/models/qwen3.html
+++ b/docs/models/qwen3.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/models/seed-oss.html b/docs/models/seed-oss.html
index 4d835674b..c5edf2178 100644
--- a/docs/models/seed-oss.html
+++ b/docs/models/seed-oss.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -825,11 +819,10 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <li><p>Install Axolotl following the <a href="https://docs.axolotl.ai/docs/installation.html">installation guide</a>.</p>
 <p>Here is an example of how to install from pip:</p>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Ensure you have a compatible version of Pytorch installed</span></span>
-<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install packaging setuptools wheel ninja</span>
-<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="st">'axolotl[flash-attn]&gt;=0.12.0'</span></span>
-<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="co"># Install Cut Cross Entropy</span></span>
-<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="ex">python</span> scripts/cutcrossentropy_install.py <span class="kw">|</span> <span class="fu">sh</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
+<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> <span class="st">'axolotl[flash-attn]&gt;=0.12.0'</span></span>
+<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Install Cut Cross Entropy</span></span>
+<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="ex">python</span> scripts/cutcrossentropy_install.py <span class="kw">|</span> <span class="fu">sh</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
 <li><p>Run the finetuning example:</p></li>
 </ol>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> train examples/seed-oss/seed-oss-36b-qlora.yaml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
diff --git a/docs/models/smolvlm2.html b/docs/models/smolvlm2.html
index 69c53e7eb..6d7bf1dff 100644
--- a/docs/models/smolvlm2.html
+++ b/docs/models/smolvlm2.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -824,10 +818,9 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <li><p>Install Axolotl following the <a href="https://docs.axolotl.ai/docs/installation.html">installation guide</a>.</p>
 <p>Here is an example of how to install from pip:</p>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Ensure you have a compatible version of Pytorch installed</span></span>
-<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install packaging setuptools wheel ninja</span>
-<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="st">'axolotl[flash-attn]&gt;=0.12.0'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
+<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> <span class="st">'axolotl[flash-attn]&gt;=0.12.0'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
 <li><p>Install an extra dependency:</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install num2words==0.5.14</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install num2words==0.5.14</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
 <li><p>Run the finetuning example:</p>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb3"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="co"># LoRA SFT (1x48GB @ 6.8GiB)</span></span>
 <span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> train examples/smolvlm2/smolvlm2-2B-lora.yaml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div></li>
diff --git a/docs/models/trinity.html b/docs/models/trinity.html
index a27b54252..eb92e2cd4 100644
--- a/docs/models/trinity.html
+++ b/docs/models/trinity.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/models/voxtral.html b/docs/models/voxtral.html
index 3a5b6db9b..cbcf81ef2 100644
--- a/docs/models/voxtral.html
+++ b/docs/models/voxtral.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -828,14 +822,13 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 <p>Here is an example of how to install from pip:</p></li>
 </ol>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Ensure you have Pytorch installed (Pytorch 2.6.0 min)</span></span>
-<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install packaging==26.0 setuptools==75.8.0 wheel ninja</span>
-<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> <span class="st">'axolotl[flash-attn]&gt;=0.12.0'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> <span class="st">'axolotl[flash-attn]&gt;=0.12.0'</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <ol start="2" type="1">
 <li>Please install the below.</li>
 </ol>
 <div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="co"># audio</span></span>
-<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install librosa==0.11.0</span>
-<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="st">'mistral_common[audio]==1.8.3'</span></span>
+<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install librosa==0.11.0</span>
+<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="st">'mistral_common[audio]==1.8.3'</span></span>
 <span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a></span>
 <span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a><span class="co"># Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy</span></span>
 <span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a><span class="ex">python</span> scripts/cutcrossentropy_install.py <span class="kw">|</span> <span class="fu">sh</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
diff --git a/docs/multi-gpu.html b/docs/multi-gpu.html
index 42f7124c7..0979cac7c 100644
--- a/docs/multi-gpu.html
+++ b/docs/multi-gpu.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/multi-node.html b/docs/multi-node.html
index b36d791c2..336e7a761 100644
--- a/docs/multi-node.html
+++ b/docs/multi-node.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/multimodal.html b/docs/multimodal.html
index b74ed570d..e63a5f3bc 100644
--- a/docs/multimodal.html
+++ b/docs/multimodal.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/multipack.html b/docs/multipack.html
index cb5d2c64c..43ecf39c6 100644
--- a/docs/multipack.html
+++ b/docs/multipack.html
@@ -662,12 +662,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/nccl.html b/docs/nccl.html
index 1e3a44a2b..8a0bd2156 100644
--- a/docs/nccl.html
+++ b/docs/nccl.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/nd_parallelism.html b/docs/nd_parallelism.html
index 078b9dc09..5f46495e2 100644
--- a/docs/nd_parallelism.html
+++ b/docs/nd_parallelism.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/optimizations.html b/docs/optimizations.html
index 0eb0f5efe..9ec9b4f9c 100644
--- a/docs/optimizations.html
+++ b/docs/optimizations.html
@@ -662,12 +662,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/optimizers.html b/docs/optimizers.html
index 5f0b31a81..3f80865ed 100644
--- a/docs/optimizers.html
+++ b/docs/optimizers.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/qat.html b/docs/qat.html
index 4443aa19e..3f14ec56b 100644
--- a/docs/qat.html
+++ b/docs/qat.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/quantize.html b/docs/quantize.html
index fa4078592..4d3f07102 100644
--- a/docs/quantize.html
+++ b/docs/quantize.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/ray-integration.html b/docs/ray-integration.html
index 9ba768171..912ce100e 100644
--- a/docs/ray-integration.html
+++ b/docs/ray-integration.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/reward_modelling.html b/docs/reward_modelling.html
index 011f4ba5b..a212986a2 100644
--- a/docs/reward_modelling.html
+++ b/docs/reward_modelling.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/rlhf.html b/docs/rlhf.html
index 3dd557ef0..8dbac2e52 100644
--- a/docs/rlhf.html
+++ b/docs/rlhf.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/sequence_parallelism.html b/docs/sequence_parallelism.html
index ec9fe6d86..91b975d7d 100644
--- a/docs/sequence_parallelism.html
+++ b/docs/sequence_parallelism.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/streaming.html b/docs/streaming.html
index ac014b185..974373845 100644
--- a/docs/streaming.html
+++ b/docs/streaming.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/telemetry.html b/docs/telemetry.html
index 47fe30110..e16cf4092 100644
--- a/docs/telemetry.html
+++ b/docs/telemetry.html
@@ -662,12 +662,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/torchao.html b/docs/torchao.html
index 1ed81ed45..b33be10db 100644
--- a/docs/torchao.html
+++ b/docs/torchao.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/training_stability.html b/docs/training_stability.html
index 019127869..74f91d1f2 100644
--- a/docs/training_stability.html
+++ b/docs/training_stability.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/docs/unsloth.html b/docs/unsloth.html
deleted file mode 100644
index 7e505e90d..000000000
--- a/docs/unsloth.html
+++ /dev/null
@@ -1,1281 +0,0 @@
-<!DOCTYPE html>
-<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
-
-<meta charset="utf-8">
-<meta name="generator" content="quarto-1.9.37">
-
-<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
-
-<meta name="description" content="Hyper-optimized QLoRA finetuning for single GPUs">
-
-<title>Unsloth – Axolotl</title>
-<style>
-/* Default styles provided by pandoc.
-** See https://pandoc.org/MANUAL.html#variables-for-html for config info.
-*/
-code{white-space: pre-wrap;}
-span.smallcaps{font-variant: small-caps;}
-div.columns{display: flex; gap: min(4vw, 1.5em);}
-div.column{flex: auto; overflow-x: auto;}
-div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
-ul.task-list{list-style: none;}
-ul.task-list li input[type="checkbox"] {
-  width: 0.8em;
-  margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */ 
-  vertical-align: middle;
-}
-/* CSS for syntax highlighting */
-html { -webkit-text-size-adjust: 100%; }
-pre > code.sourceCode { white-space: pre; position: relative; }
-pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
-pre > code.sourceCode > span:empty { height: 1.2em; }
-.sourceCode { overflow: visible; }
-code.sourceCode > span { color: inherit; text-decoration: inherit; }
-div.sourceCode { margin: 1em 0; }
-pre.sourceCode { margin: 0; }
-@media screen {
-div.sourceCode { overflow: auto; }
-}
-@media print {
-pre > code.sourceCode { white-space: pre-wrap; }
-pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
-}
-pre.numberSource code
-  { counter-reset: source-line 0; }
-pre.numberSource code > span
-  { position: relative; left: -4em; counter-increment: source-line; }
-pre.numberSource code > span > a:first-child::before
-  { content: counter(source-line);
-    position: relative; left: -1em; text-align: right; vertical-align: baseline;
-    border: none; display: inline-block;
-    -webkit-touch-callout: none; -webkit-user-select: none;
-    -khtml-user-select: none; -moz-user-select: none;
-    -ms-user-select: none; user-select: none;
-    padding: 0 4px; width: 4em;
-  }
-pre.numberSource { margin-left: 3em;  padding-left: 4px; }
-div.sourceCode
-  {   }
-@media screen {
-pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
-}
-</style>
-
-
-<script src="../site_libs/quarto-nav/quarto-nav.js"></script>
-<script src="../site_libs/clipboard/clipboard.min.js"></script>
-<script src="../site_libs/quarto-search/autocomplete.umd.js"></script>
-<script src="../site_libs/quarto-search/fuse.min.js"></script>
-<script src="../site_libs/quarto-search/quarto-search.js"></script>
-<meta name="quarto:offset" content="../">
-<link href="../favicon.jpg" rel="icon" type="image/jpeg">
-<script src="../site_libs/quarto-html/quarto.js" type="module"></script>
-<script src="../site_libs/quarto-html/tabsets/tabsets.js" type="module"></script>
-<script src="../site_libs/quarto-html/popper.min.js"></script>
-<script src="../site_libs/quarto-html/tippy.umd.min.js"></script>
-<script src="../site_libs/quarto-html/anchor.min.js"></script>
-<link href="../site_libs/quarto-html/tippy.css" rel="stylesheet">
-<link href="../site_libs/quarto-html/quarto-syntax-highlighting-dark-d0ae9245876894da5ac7e18953ecc5cc.css" rel="stylesheet" id="quarto-text-highlighting-styles">
-<script src="../site_libs/bootstrap/bootstrap.min.js"></script>
-<link href="../site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
-<link href="../site_libs/bootstrap/bootstrap-ab6ebd6eb475c4578b58908bc314f719.min.css" rel="stylesheet" append-hash="true" id="quarto-bootstrap" data-mode="dark">
-<script id="quarto-search-options" type="application/json">{
-  "location": "navbar",
-  "copy-button": false,
-  "collapse-after": 3,
-  "panel-placement": "end",
-  "type": "overlay",
-  "limit": 50,
-  "keyboard-shortcut": [
-    "f",
-    "/",
-    "s"
-  ],
-  "show-item-context": false,
-  "language": {
-    "search-no-results-text": "No results",
-    "search-matching-documents-text": "matching documents",
-    "search-copy-link-title": "Copy link to search",
-    "search-hide-matches-text": "Hide additional matches",
-    "search-more-match-text": "more match in this document",
-    "search-more-matches-text": "more matches in this document",
-    "search-clear-button-title": "Clear",
-    "search-text-placeholder": "",
-    "search-detached-cancel-button-title": "Cancel",
-    "search-submit-button-title": "Submit",
-    "search-label": "Search"
-  }
-}</script>
-<script async="" src="https://www.googletagmanager.com/gtag/js?id=G-9KYCVJBNMQ"></script>
-
-<script type="text/javascript">
-
-window.dataLayer = window.dataLayer || [];
-function gtag(){dataLayer.push(arguments);}
-gtag('js', new Date());
-gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
-</script>
-
-
-<link rel="stylesheet" href="../styles.css">
-</head>
-
-<body class="nav-sidebar docked nav-fixed quarto-light">
-
-<div id="quarto-search-results"></div>
-  <header id="quarto-header" class="headroom fixed-top">
-    <nav class="navbar navbar-expand " data-bs-theme="dark">
-      <div class="navbar-container container-fluid">
-      <div class="navbar-brand-container mx-auto">
-    <a href="../index.html" class="navbar-brand navbar-brand-logo">
-    <img src="../image/axolotl_logo_digital_white.svg" alt="" class="navbar-logo light-content">
-    <img src="../image/axolotl_logo_digital_white.svg" alt="" class="navbar-logo dark-content">
-    </a>
-  </div>
-        <div class="quarto-navbar-tools tools-wide tools-end">
-    <a href="https://twitter.com/axolotl_ai" title="" class="quarto-navigation-tool px-1" aria-label=""><i class="bi bi-twitter"></i></a>
-    <a href="https://github.com/axolotl-ai-cloud/axolotl/" title="" class="quarto-navigation-tool px-1" aria-label=""><i class="bi bi-github"></i></a>
-    <a href="https://discord.gg/7m9sfhzaf3" title="" class="quarto-navigation-tool px-1" aria-label=""><i class="bi bi-discord"></i></a>
-</div>
-          <div id="quarto-search" class="" title="Search"></div>
-      </div> <!-- /container-fluid -->
-    </nav>
-  <nav class="quarto-secondary-nav">
-    <div class="container-fluid d-flex">
-      <button type="button" class="quarto-btn-toggle btn" data-bs-toggle="collapse" role="button" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
-        <i class="bi bi-layout-text-sidebar-reverse"></i>
-      </button>
-        <nav class="quarto-page-breadcrumbs" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="../docs/fsdp_qlora.html">Advanced Features</a></li><li class="breadcrumb-item"><a href="../docs/unsloth.html">Unsloth</a></li></ol></nav>
-        <a class="flex-grow-1" role="navigation" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">      
-        </a>
-    </div>
-  </nav>
-</header>
-<!-- content -->
-<div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article page-navbar">
-<!-- sidebar -->
-  <nav id="quarto-sidebar" class="sidebar collapse collapse-horizontal quarto-sidebar-collapse-item sidebar-navigation docked overflow-auto">
-    <div class="sidebar-menu-container"> 
-    <ul class="list-unstyled mt-1">
-        <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../index.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Home</span></a>
-  </div>
-</li>
-        <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true">
- <span class="menu-text">Getting Started</span></a>
-          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-1" class="collapse list-unstyled sidebar-section depth1 show">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/getting-started.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Quickstart</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/choosing_method.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Which Fine-Tuning Method Should I Use?</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/installation.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Installation</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/inference.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Inference and Merging</span></a>
-  </div>
-</li>
-          <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" role="navigation" aria-expanded="false">
- <span class="menu-text">Model Guides</span></a>
-          <a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" role="navigation" aria-expanded="false" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-2" class="collapse list-unstyled sidebar-section depth2 ">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/kimi-linear.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Kimi Linear</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/plano.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Plano Orchestrator</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/mimo.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">MiMo</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/internvl3_5.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">InternVL 3.5</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/olmo3.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">OLMo 3</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/trinity.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Trinity</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/arcee.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Arcee AFM</span></a>
-  </div>
-</li>
-          <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="false">
- <span class="menu-text">Ministral3</span></a>
-          <a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="false" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-3" class="collapse list-unstyled sidebar-section depth3 ">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/ministral3.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Ministral3</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/ministral3/think.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Ministral 3 Thinking</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/ministral3/vision.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Ministral 3 Vision</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-          <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-4" role="navigation" aria-expanded="false">
- <span class="menu-text">Magistral</span></a>
-          <a class="sidebar-item-toggle text-start collapsed" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-4" role="navigation" aria-expanded="false" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-4" class="collapse list-unstyled sidebar-section depth3 ">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/magistral.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Magistral</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/magistral/think.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Magistral Thinking</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/magistral/vision.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Magistral Vision</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/ministral.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Ministral</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/mistral-small.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Mistral Small 3.1/3.2</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/voxtral.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Voxtral</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/devstral.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Devstral</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/mistral.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Mistral 7B</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/llama-4.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Llama 4</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/llama-2.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Llama 2</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/qwen3-next.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Qwen 3 Next</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/qwen3.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Qwen 3</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/gemma3n.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Gemma 3n</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/apertus.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Apertus</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/gpt-oss.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">GPT-OSS</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/seed-oss.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Seed-OSS</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/phi.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Phi</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/smolvlm2.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">SmolVLM 2</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/granite4.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Granite 4</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/LiquidAI.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Liquid Foundation Models 2</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/hunyuan.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Hunyuan</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/jamba.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Jamba</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/models/orpheus.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Orpheus</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/cli.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Command Line Interface (CLI)</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/telemetry.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Telemetry</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/config-reference.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Config Reference</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/api" class="sidebar-item-text sidebar-link">
- <span class="menu-text">API Reference</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-        <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a href="../docs/dataset-formats/index.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Dataset Formats</span></a>
-          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-5" role="navigation" aria-expanded="true" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-5" class="collapse list-unstyled sidebar-section depth1 show">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/dataset-formats/pretraining.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Pre-training</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/dataset-formats/inst_tune.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Instruction Tuning</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/dataset-formats/conversation.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Conversation</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/dataset-formats/stepwise_supervised.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Stepwise Supervised Format</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/dataset-formats/template_free.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Template-Free</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/dataset-formats/tokenized.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Custom Pre-Tokenized Dataset</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-        <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-6" role="navigation" aria-expanded="true">
- <span class="menu-text">Deployments</span></a>
-          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-6" role="navigation" aria-expanded="true" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-6" class="collapse list-unstyled sidebar-section depth1 show">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/docker.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Docker</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/multi-gpu.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Multi-GPU</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/multi-node.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Multi Node</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Ray Train</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">AMD GPUs on HPC Systems</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/mac.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Mac M-series</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-        <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-7" role="navigation" aria-expanded="true">
- <span class="menu-text">How To Guides</span></a>
-          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-7" role="navigation" aria-expanded="true" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-7" class="collapse list-unstyled sidebar-section depth1 show">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/multimodal.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">MultiModal / Vision Language Models (BETA)</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/rlhf.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">RLHF (Beta)</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/grpo.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">GRPO Training</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/ebft.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">EBFT Training</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/vllm_serving.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">vLLM Serving for GRPO Training</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/reward_modelling.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Reward Modelling</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/lr_groups.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Learning Rate Groups</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/lora_optims.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">LoRA Optimizations</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/dataset_loading.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Dataset Loading</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/qat.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Quantization Aware Training (QAT)</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/quantize.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Quantization with torchao</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/optimizations.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Optimizations Guide</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-        <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-8" role="navigation" aria-expanded="true">
- <span class="menu-text">Core Concepts</span></a>
-          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-8" role="navigation" aria-expanded="true" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-8" class="collapse list-unstyled sidebar-section depth1 show">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/batch_vs_grad.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Batch size vs Gradient accumulation</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/dataset_preprocessing.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Dataset Preprocessing</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/streaming.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Streaming Datasets</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/multipack.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Multipack (Sample Packing)</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/mixed_precision.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Mixed Precision Training</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/optimizers.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Optimizers</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/attention.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Attention</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-        <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-9" role="navigation" aria-expanded="true">
- <span class="menu-text">Advanced Features</span></a>
-          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-9" role="navigation" aria-expanded="true" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-9" class="collapse list-unstyled sidebar-section depth1 show">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">FSDP + QLoRA</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link active">
- <span class="menu-text">Unsloth</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/torchao.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">PyTorch ao</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/custom_integrations.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Custom Integrations</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/sequence_parallelism.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Sequence Parallelism</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/gradient_checkpointing.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Gradient Checkpointing, Activation Offloading, and Layer Offloading</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/nd_parallelism.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">N-D Parallelism (Beta)</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/expert_quantization.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">MoE Expert Quantization</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-        <li class="sidebar-item sidebar-item-section">
-      <div class="sidebar-item-container"> 
-            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-10" role="navigation" aria-expanded="true">
- <span class="menu-text">Troubleshooting</span></a>
-          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-10" role="navigation" aria-expanded="true" aria-label="Toggle section">
-            <i class="bi bi-chevron-right ms-2"></i>
-          </a> 
-      </div>
-      <ul id="quarto-sidebar-section-10" class="collapse list-unstyled sidebar-section depth1 show">  
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/faq.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">FAQ</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/training_stability.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Training Stability &amp; Debugging</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/debugging.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Debugging</span></a>
-  </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/nccl.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">NCCL</span></a>
-  </div>
-</li>
-      </ul>
-  </li>
-    </ul>
-    </div>
-</nav>
-<div id="quarto-sidebar-glass" class="quarto-sidebar-collapse-item" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item"></div>
-<!-- margin-sidebar -->
-    <div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
-        <nav id="TOC" role="doc-toc" class="toc-active">
-    <h2 id="toc-title">On this page</h2>
-   
-  <ul>
-  <li><a href="#overview" id="toc-overview" class="nav-link active" data-scroll-target="#overview">Overview</a></li>
-  <li><a href="#installation" id="toc-installation" class="nav-link" data-scroll-target="#installation">Installation</a></li>
-  <li><a href="#usage" id="toc-usage" class="nav-link" data-scroll-target="#usage">Usage</a></li>
-  <li><a href="#limitations" id="toc-limitations" class="nav-link" data-scroll-target="#limitations">Limitations</a></li>
-  </ul>
-</nav>
-    </div>
-<!-- main -->
-<main class="content" id="quarto-document-content">
-
-<header id="title-block-header" class="quarto-title-block default"><nav class="quarto-page-breadcrumbs quarto-title-breadcrumbs d-none d-lg-block" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="../docs/fsdp_qlora.html">Advanced Features</a></li><li class="breadcrumb-item"><a href="../docs/unsloth.html">Unsloth</a></li></ol></nav>
-<div class="quarto-title">
-<h1 class="title">Unsloth</h1>
-</div>
-
-<div>
-  <div class="description">
-    Hyper-optimized QLoRA finetuning for single GPUs
-  </div>
-</div>
-
-
-<div class="quarto-title-meta">
-
-    
-  
-    
-  </div>
-  
-
-
-</header>
-
-
-<section id="overview" class="level3">
-<h3 class="anchored" data-anchor-id="overview">Overview</h3>
-<p>Unsloth provides hand-written optimized kernels for LLM finetuning that slightly improve speed and VRAM over
-standard industry baselines.</p>
-<div class="callout callout-style-default callout-important callout-titled">
-<div class="callout-header d-flex align-content-center">
-<div class="callout-icon-container">
-<i class="callout-icon"></i>
-</div>
-<div class="callout-title-container flex-fill">
-Important
-</div>
-</div>
-<div class="callout-body-container callout-body">
-<p>Due to breaking changes in transformers <code>v4.48.0</code>, users will need to downgrade to <code>&lt;=v4.47.1</code> to use this patch.</p>
-<p>This will later be deprecated in favor of <a href="../docs/lora_optims.html">LoRA Optimizations</a>.</p>
-</div>
-</div>
-</section>
-<section id="installation" class="level3">
-<h3 class="anchored" data-anchor-id="installation">Installation</h3>
-<p>The following will install the correct unsloth and extras from source.</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="ex">python</span> scripts/unsloth_install.py <span class="kw">|</span> <span class="fu">sh</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
-</section>
-<section id="usage" class="level3">
-<h3 class="anchored" data-anchor-id="usage">Usage</h3>
-<p>Axolotl exposes a few configuration options to try out unsloth and get most of the performance gains.</p>
-<p>Our unsloth integration is currently limited to the following model architectures:
-- llama</p>
-<p>These options are specific to LoRA finetuning and cannot be used for multi-GPU finetuning</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_lora_mlp</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
-<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_lora_qkv</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
-<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_lora_o</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
-<p>These options are composable and can be used with multi-gpu finetuning</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb3"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_cross_entropy_loss</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
-<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_rms_norm</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
-<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a><span class="fu">unsloth_rope</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
-</section>
-<section id="limitations" class="level3">
-<h3 class="anchored" data-anchor-id="limitations">Limitations</h3>
-<ul>
-<li>Single GPU only; e.g.&nbsp;no multi-gpu support</li>
-<li>No deepspeed or FSDP support (requires multi-gpu)</li>
-<li>LoRA + QLoRA support only. No full fine tunes or fp8 support.</li>
-<li>Limited model architecture support. Llama, Phi, Gemma, Mistral only</li>
-<li>No MoE support.</li>
-</ul>
-
-
-</section>
-
-</main> <!-- /main -->
-<script id="quarto-html-after-body" type="application/javascript">
-  window.document.addEventListener("DOMContentLoaded", function (event) {
-    const icon = "";
-    const anchorJS = new window.AnchorJS();
-    anchorJS.options = {
-      placement: 'right',
-      icon: icon
-    };
-    anchorJS.add('.anchored');
-    const isCodeAnnotation = (el) => {
-      for (const clz of el.classList) {
-        if (clz.startsWith('code-annotation-')) {                     
-          return true;
-        }
-      }
-      return false;
-    }
-    const onCopySuccess = function(e) {
-      // button target
-      const button = e.trigger;
-      // don't keep focus
-      button.blur();
-      // flash "checked"
-      button.classList.add('code-copy-button-checked');
-      var currentTitle = button.getAttribute("title");
-      button.setAttribute("title", "Copied!");
-      let tooltip;
-      if (window.bootstrap) {
-        button.setAttribute("data-bs-toggle", "tooltip");
-        button.setAttribute("data-bs-placement", "left");
-        button.setAttribute("data-bs-title", "Copied!");
-        tooltip = new bootstrap.Tooltip(button, 
-          { trigger: "manual", 
-            customClass: "code-copy-button-tooltip",
-            offset: [0, -8]});
-        tooltip.show();    
-      }
-      setTimeout(function() {
-        if (tooltip) {
-          tooltip.hide();
-          button.removeAttribute("data-bs-title");
-          button.removeAttribute("data-bs-toggle");
-          button.removeAttribute("data-bs-placement");
-        }
-        button.setAttribute("title", currentTitle);
-        button.classList.remove('code-copy-button-checked');
-      }, 1000);
-      // clear code selection
-      e.clearSelection();
-    }
-    const getTextToCopy = function(trigger) {
-      const outerScaffold = trigger.parentElement.cloneNode(true);
-      const codeEl = outerScaffold.querySelector('code');
-      for (const childEl of codeEl.children) {
-        if (isCodeAnnotation(childEl)) {
-          childEl.remove();
-        }
-      }
-      return codeEl.innerText;
-    }
-    const clipboard = new window.ClipboardJS('.code-copy-button:not([data-in-quarto-modal])', {
-      text: getTextToCopy
-    });
-    clipboard.on('success', onCopySuccess);
-    if (window.document.getElementById('quarto-embedded-source-code-modal')) {
-      const clipboardModal = new window.ClipboardJS('.code-copy-button[data-in-quarto-modal]', {
-        text: getTextToCopy,
-        container: window.document.getElementById('quarto-embedded-source-code-modal')
-      });
-      clipboardModal.on('success', onCopySuccess);
-    }
-      var localhostRegex = new RegExp(/^(?:http|https):\/\/localhost\:?[0-9]*\//);
-      var mailtoRegex = new RegExp(/^mailto:/);
-        var filterRegex = new RegExp("https:\/\/docs\.axolotl\.ai");
-      var isInternal = (href) => {
-          return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
-      }
-      // Inspect non-navigation links and adorn them if external
-     var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
-      for (var i=0; i<links.length; i++) {
-        const link = links[i];
-        if (!isInternal(link.href)) {
-          // undo the damage that might have been done by quarto-nav.js in the case of
-          // links that we want to consider external
-          if (link.dataset.originalHref !== undefined) {
-            link.href = link.dataset.originalHref;
-          }
-        }
-      }
-    function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
-      const config = {
-        allowHTML: true,
-        maxWidth: 500,
-        delay: 100,
-        arrow: false,
-        appendTo: function(el) {
-            return el.parentElement;
-        },
-        interactive: true,
-        interactiveBorder: 10,
-        theme: 'quarto',
-        placement: 'bottom-start',
-      };
-      if (contentFn) {
-        config.content = contentFn;
-      }
-      if (onTriggerFn) {
-        config.onTrigger = onTriggerFn;
-      }
-      if (onUntriggerFn) {
-        config.onUntrigger = onUntriggerFn;
-      }
-      window.tippy(el, config); 
-    }
-    const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
-    for (var i=0; i<noterefs.length; i++) {
-      const ref = noterefs[i];
-      tippyHover(ref, function() {
-        // use id or data attribute instead here
-        let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
-        try { href = new URL(href).hash; } catch {}
-        const id = href.replace(/^#\/?/, "");
-        const note = window.document.getElementById(id);
-        if (note) {
-          return note.innerHTML;
-        } else {
-          return "";
-        }
-      });
-    }
-    const xrefs = window.document.querySelectorAll('a.quarto-xref');
-    const processXRef = (id, note) => {
-      // Strip column container classes
-      const stripColumnClz = (el) => {
-        el.classList.remove("page-full", "page-columns");
-        if (el.children) {
-          for (const child of el.children) {
-            stripColumnClz(child);
-          }
-        }
-      }
-      stripColumnClz(note)
-      if (id === null || id.startsWith('sec-')) {
-        // Special case sections, only their first couple elements
-        const container = document.createElement("div");
-        if (note.children && note.children.length > 2) {
-          container.appendChild(note.children[0].cloneNode(true));
-          for (let i = 1; i < note.children.length; i++) {
-            const child = note.children[i];
-            if (child.tagName === "P" && child.innerText === "") {
-              continue;
-            } else {
-              container.appendChild(child.cloneNode(true));
-              break;
-            }
-          }
-          if (window.Quarto?.typesetMath) {
-            window.Quarto.typesetMath(container);
-          }
-          return container.innerHTML
-        } else {
-          if (window.Quarto?.typesetMath) {
-            window.Quarto.typesetMath(note);
-          }
-          return note.innerHTML;
-        }
-      } else {
-        // Remove any anchor links if they are present
-        const anchorLink = note.querySelector('a.anchorjs-link');
-        if (anchorLink) {
-          anchorLink.remove();
-        }
-        if (window.Quarto?.typesetMath) {
-          window.Quarto.typesetMath(note);
-        }
-        if (note.classList.contains("callout")) {
-          return note.outerHTML;
-        } else {
-          return note.innerHTML;
-        }
-      }
-    }
-    for (var i=0; i<xrefs.length; i++) {
-      const xref = xrefs[i];
-      tippyHover(xref, undefined, function(instance) {
-        instance.disable();
-        let url = xref.getAttribute('href');
-        let hash = undefined; 
-        if (url.startsWith('#')) {
-          hash = url;
-        } else {
-          try { hash = new URL(url).hash; } catch {}
-        }
-        if (hash) {
-          const id = hash.replace(/^#\/?/, "");
-          const note = window.document.getElementById(id);
-          if (note !== null) {
-            try {
-              const html = processXRef(id, note.cloneNode(true));
-              instance.setContent(html);
-            } finally {
-              instance.enable();
-              instance.show();
-            }
-          } else {
-            // See if we can fetch this
-            fetch(url.split('#')[0])
-            .then(res => res.text())
-            .then(html => {
-              const parser = new DOMParser();
-              const htmlDoc = parser.parseFromString(html, "text/html");
-              const note = htmlDoc.getElementById(id);
-              if (note !== null) {
-                const html = processXRef(id, note);
-                instance.setContent(html);
-              } 
-            }).finally(() => {
-              instance.enable();
-              instance.show();
-            });
-          }
-        } else {
-          // See if we can fetch a full url (with no hash to target)
-          // This is a special case and we should probably do some content thinning / targeting
-          fetch(url)
-          .then(res => res.text())
-          .then(html => {
-            const parser = new DOMParser();
-            const htmlDoc = parser.parseFromString(html, "text/html");
-            const note = htmlDoc.querySelector('main.content');
-            if (note !== null) {
-              // This should only happen for chapter cross references
-              // (since there is no id in the URL)
-              // remove the first header
-              if (note.children.length > 0 && note.children[0].tagName === "HEADER") {
-                note.children[0].remove();
-              }
-              const html = processXRef(null, note);
-              instance.setContent(html);
-            } 
-          }).finally(() => {
-            instance.enable();
-            instance.show();
-          });
-        }
-      }, function(instance) {
-      });
-    }
-        let selectedAnnoteEl;
-        const selectorForAnnotation = ( cell, annotation) => {
-          let cellAttr = 'data-code-cell="' + cell + '"';
-          let lineAttr = 'data-code-annotation="' +  annotation + '"';
-          const selector = 'span[' + cellAttr + '][' + lineAttr + ']';
-          return selector;
-        }
-        const selectCodeLines = (annoteEl) => {
-          const doc = window.document;
-          const targetCell = annoteEl.getAttribute("data-target-cell");
-          const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
-          const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
-          const lines = annoteSpan.getAttribute("data-code-lines").split(",");
-          const lineIds = lines.map((line) => {
-            return targetCell + "-" + line;
-          })
-          let top = null;
-          let height = null;
-          let parent = null;
-          if (lineIds.length > 0) {
-              //compute the position of the single el (top and bottom and make a div)
-              const el = window.document.getElementById(lineIds[0]);
-              top = el.offsetTop;
-              height = el.offsetHeight;
-              parent = el.parentElement.parentElement;
-            if (lineIds.length > 1) {
-              const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
-              const bottom = lastEl.offsetTop + lastEl.offsetHeight;
-              height = bottom - top;
-            }
-            if (top !== null && height !== null && parent !== null) {
-              // cook up a div (if necessary) and position it 
-              let div = window.document.getElementById("code-annotation-line-highlight");
-              if (div === null) {
-                div = window.document.createElement("div");
-                div.setAttribute("id", "code-annotation-line-highlight");
-                div.style.position = 'absolute';
-                parent.appendChild(div);
-              }
-              div.style.top = top - 2 + "px";
-              div.style.height = height + 4 + "px";
-              div.style.left = 0;
-              let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
-              if (gutterDiv === null) {
-                gutterDiv = window.document.createElement("div");
-                gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
-                gutterDiv.style.position = 'absolute';
-                const codeCell = window.document.getElementById(targetCell);
-                const gutter = codeCell.querySelector('.code-annotation-gutter');
-                gutter.appendChild(gutterDiv);
-              }
-              gutterDiv.style.top = top - 2 + "px";
-              gutterDiv.style.height = height + 4 + "px";
-            }
-            selectedAnnoteEl = annoteEl;
-          }
-        };
-        const unselectCodeLines = () => {
-          const elementsIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"];
-          elementsIds.forEach((elId) => {
-            const div = window.document.getElementById(elId);
-            if (div) {
-              div.remove();
-            }
-          });
-          selectedAnnoteEl = undefined;
-        };
-          // Handle positioning of the toggle
-      window.addEventListener(
-        "resize",
-        throttle(() => {
-          elRect = undefined;
-          if (selectedAnnoteEl) {
-            selectCodeLines(selectedAnnoteEl);
-          }
-        }, 10)
-      );
-      function throttle(fn, ms) {
-      let throttle = false;
-      let timer;
-        return (...args) => {
-          if(!throttle) { // first call gets through
-              fn.apply(this, args);
-              throttle = true;
-          } else { // all the others get throttled
-              if(timer) clearTimeout(timer); // cancel #2
-              timer = setTimeout(() => {
-                fn.apply(this, args);
-                timer = throttle = false;
-              }, ms);
-          }
-        };
-      }
-        // Attach click handler to the DT
-        const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
-        for (const annoteDlNode of annoteDls) {
-          annoteDlNode.addEventListener('click', (event) => {
-            const clickedEl = event.target;
-            if (clickedEl !== selectedAnnoteEl) {
-              unselectCodeLines();
-              const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
-              if (activeEl) {
-                activeEl.classList.remove('code-annotation-active');
-              }
-              selectCodeLines(clickedEl);
-              clickedEl.classList.add('code-annotation-active');
-            } else {
-              // Unselect the line
-              unselectCodeLines();
-              clickedEl.classList.remove('code-annotation-active');
-            }
-          });
-        }
-    const findCites = (el) => {
-      const parentEl = el.parentElement;
-      if (parentEl) {
-        const cites = parentEl.dataset.cites;
-        if (cites) {
-          return {
-            el,
-            cites: cites.split(' ')
-          };
-        } else {
-          return findCites(el.parentElement)
-        }
-      } else {
-        return undefined;
-      }
-    };
-    var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
-    for (var i=0; i<bibliorefs.length; i++) {
-      const ref = bibliorefs[i];
-      const citeInfo = findCites(ref);
-      if (citeInfo) {
-        tippyHover(citeInfo.el, function() {
-          var popup = window.document.createElement('div');
-          citeInfo.cites.forEach(function(cite) {
-            var citeDiv = window.document.createElement('div');
-            citeDiv.classList.add('hanging-indent');
-            citeDiv.classList.add('csl-entry');
-            var biblioDiv = window.document.getElementById('ref-' + cite);
-            if (biblioDiv) {
-              citeDiv.innerHTML = biblioDiv.innerHTML;
-            }
-            popup.appendChild(citeDiv);
-          });
-          return popup.innerHTML;
-        });
-      }
-    }
-  });
-  </script>
-</div> <!-- /content -->
-
-
-
-
-</body></html>
\ No newline at end of file
diff --git a/docs/vllm_serving.html b/docs/vllm_serving.html
index eb113b9de..3693be8e4 100644
--- a/docs/vllm_serving.html
+++ b/docs/vllm_serving.html
@@ -697,12 +697,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/examples/colab-notebooks/colab-axolotl-example.html b/examples/colab-notebooks/colab-axolotl-example.html
index 708128d57..d9e88e3fe 100644
--- a/examples/colab-notebooks/colab-axolotl-example.html
+++ b/examples/colab-notebooks/colab-axolotl-example.html
@@ -700,12 +700,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/index.html b/index.html
index bf9367842..fd06a0b7d 100644
--- a/index.html
+++ b/index.html
@@ -696,12 +696,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="./docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="./docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
@@ -920,45 +914,26 @@ Expand older updates
 </section>
 <section id="installation" class="level3">
 <h3 class="anchored" data-anchor-id="installation">Installation</h3>
-<section id="using-uv-recommended" class="level4">
-<h4 class="anchored" data-anchor-id="using-uv-recommended">Using uv (recommended)</h4>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="co"># install uv if you don't already have it installed</span></span>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="co"># install uv if you don't already have it installed (restart shell after)</span></span>
 <span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="ex">curl</span> <span class="at">-LsSf</span> https://astral.sh/uv/install.sh <span class="kw">|</span> <span class="fu">sh</span></span>
-<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> <span class="va">$HOME</span>/.local/bin/env</span>
-<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="co"># CUDA 12.8.1 tends to have better package compatibility</span></span>
-<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">UV_TORCH_BACKEND</span><span class="op">=</span>cu128</span>
-<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="co"># create a new virtual environment</span></span>
-<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> venv <span class="at">--python</span> 3.12</span>
-<span id="cb1-10"><a href="#cb1-10" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> .venv/bin/activate</span>
-<span id="cb1-11"><a href="#cb1-11" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-12"><a href="#cb1-12" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install torch==2.10.0 torchvision</span>
-<span id="cb1-13"><a href="#cb1-13" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> axolotl<span class="pp">[</span><span class="ss">deepspeed</span><span class="pp">]</span></span>
-<span id="cb1-14"><a href="#cb1-14" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-15"><a href="#cb1-15" aria-hidden="true" tabindex="-1"></a><span class="co"># recommended - install cut-cross-entropy</span></span>
-<span id="cb1-16"><a href="#cb1-16" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="st">"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@main"</span></span>
-<span id="cb1-17"><a href="#cb1-17" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-18"><a href="#cb1-18" aria-hidden="true" tabindex="-1"></a><span class="co"># (optional) - prefetch flash-attn2 and causal-conv1d kernels</span></span>
-<span id="cb1-19"><a href="#cb1-19" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> run <span class="at">--python</span> 3.12 python <span class="at">-c</span> <span class="st">"from kernels import get_kernel; get_kernel('kernels-community/flash-attn2'); get_kernel('kernels-community/causal-conv1d')"</span></span>
-<span id="cb1-20"><a href="#cb1-20" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-21"><a href="#cb1-21" aria-hidden="true" tabindex="-1"></a><span class="co"># Download example axolotl configs, deepspeed configs</span></span>
-<span id="cb1-22"><a href="#cb1-22" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> fetch examples</span>
-<span id="cb1-23"><a href="#cb1-23" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> fetch deepspeed_configs  <span class="co"># OPTIONAL</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
-</section>
-<section id="using-pip" class="level4">
-<h4 class="anchored" data-anchor-id="using-pip">Using pip</h4>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">-U</span> packaging==26.0 setuptools==75.8.0 wheel ninja</span>
-<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> axolotl<span class="pp">[</span><span class="ss">flash</span><span class="pp">-</span><span class="ss">attn,deepspeed</span><span class="pp">]</span></span>
-<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Download example axolotl configs, deepspeed configs</span></span>
-<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> fetch examples</span>
-<span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> fetch deepspeed_configs  <span class="co"># OPTIONAL</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
-</section>
+<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="co"># change depending on system</span></span>
+<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="bu">export</span> <span class="va">UV_TORCH_BACKEND</span><span class="op">=</span>cu128</span>
+<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a><span class="co"># create a new virtual environment</span></span>
+<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> venv <span class="at">--python</span> 3.12</span>
+<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a><span class="bu">source</span> .venv/bin/activate</span>
+<span id="cb1-10"><a href="#cb1-10" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-11"><a href="#cb1-11" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install torch==2.10.0 torchvision</span>
+<span id="cb1-12"><a href="#cb1-12" aria-hidden="true" tabindex="-1"></a><span class="ex">uv</span> pip install <span class="at">--no-build-isolation</span> axolotl<span class="pp">[</span><span class="ss">deepspeed</span><span class="pp">]</span></span>
+<span id="cb1-13"><a href="#cb1-13" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-14"><a href="#cb1-14" aria-hidden="true" tabindex="-1"></a><span class="co"># Download example axolotl configs, deepspeed configs</span></span>
+<span id="cb1-15"><a href="#cb1-15" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> fetch examples</span>
+<span id="cb1-16"><a href="#cb1-16" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> fetch deepspeed_configs  <span class="co"># OPTIONAL</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <section id="using-docker" class="level4">
 <h4 class="anchored" data-anchor-id="using-docker">Using Docker</h4>
 <p>Installing with Docker can be less error prone than installing in your own environment.</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb3"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="ex">docker</span> run <span class="at">--gpus</span> <span class="st">'"all"'</span> <span class="at">--rm</span> <span class="at">-it</span> axolotlai/axolotl:main-latest</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="ex">docker</span> run <span class="at">--gpus</span> <span class="st">'"all"'</span> <span class="at">--ipc</span><span class="op">=</span>host <span class="at">--rm</span> <span class="at">-it</span> axolotlai/axolotl:main-latest</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <p>Other installation approaches are described <a href="https://docs.axolotl.ai/docs/installation.html">here</a>.</p>
 </section>
 <section id="cloud-providers" class="level4">
@@ -978,14 +953,14 @@ Expand older updates
 </section>
 <section id="your-first-fine-tune" class="level3">
 <h3 class="anchored" data-anchor-id="your-first-fine-tune">Your First Fine-tune</h3>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb4"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Fetch axolotl examples</span></span>
-<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> fetch examples</span>
-<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Or, specify a custom path</span></span>
-<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> fetch examples <span class="at">--dest</span> path/to/folder</span>
-<span id="cb4-6"><a href="#cb4-6" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb4-7"><a href="#cb4-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Train a model using LoRA</span></span>
-<span id="cb4-8"><a href="#cb4-8" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> train examples/llama-3/lora-1b.yml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb3"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Fetch axolotl examples</span></span>
+<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> fetch examples</span>
+<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Or, specify a custom path</span></span>
+<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> fetch examples <span class="at">--dest</span> path/to/folder</span>
+<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb3-7"><a href="#cb3-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Train a model using LoRA</span></span>
+<span id="cb3-8"><a href="#cb3-8" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> train examples/llama-3/lora-1b.yml</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <p>That’s it! Check out our <a href="https://docs.axolotl.ai/docs/getting-started.html">Getting Started Guide</a> for a more detailed walkthrough.</p>
 </section>
 </section>
@@ -1006,20 +981,20 @@ Expand older updates
 <section id="ai-agent-support" class="level2">
 <h2 class="anchored" data-anchor-id="ai-agent-support">AI Agent Support</h2>
 <p>Axolotl ships with built-in documentation optimized for AI coding agents (Claude Code, Cursor, Copilot, etc.). These docs are bundled with the pip package — no repo clone needed.</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb5"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Show overview and available training methods</span></span>
-<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> agent-docs</span>
-<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Topic-specific references</span></span>
-<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> agent-docs sft                 <span class="co"># supervised fine-tuning</span></span>
-<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> agent-docs grpo                <span class="co"># GRPO online RL</span></span>
-<span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> agent-docs preference_tuning   <span class="co"># DPO, KTO, ORPO, SimPO</span></span>
-<span id="cb5-8"><a href="#cb5-8" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> agent-docs reward_modelling    <span class="co"># outcome and process reward models</span></span>
-<span id="cb5-9"><a href="#cb5-9" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> agent-docs pretraining         <span class="co"># continual pretraining</span></span>
-<span id="cb5-10"><a href="#cb5-10" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> agent-docs <span class="at">--list</span>              <span class="co"># list all topics</span></span>
-<span id="cb5-11"><a href="#cb5-11" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb5-12"><a href="#cb5-12" aria-hidden="true" tabindex="-1"></a><span class="co"># Dump config schema for programmatic use</span></span>
-<span id="cb5-13"><a href="#cb5-13" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> config-schema</span>
-<span id="cb5-14"><a href="#cb5-14" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> config-schema <span class="at">--field</span> adapter</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb4"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Show overview and available training methods</span></span>
+<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> agent-docs</span>
+<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Topic-specific references</span></span>
+<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> agent-docs sft                 <span class="co"># supervised fine-tuning</span></span>
+<span id="cb4-6"><a href="#cb4-6" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> agent-docs grpo                <span class="co"># GRPO online RL</span></span>
+<span id="cb4-7"><a href="#cb4-7" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> agent-docs preference_tuning   <span class="co"># DPO, KTO, ORPO, SimPO</span></span>
+<span id="cb4-8"><a href="#cb4-8" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> agent-docs reward_modelling    <span class="co"># outcome and process reward models</span></span>
+<span id="cb4-9"><a href="#cb4-9" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> agent-docs pretraining         <span class="co"># continual pretraining</span></span>
+<span id="cb4-10"><a href="#cb4-10" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> agent-docs <span class="at">--list</span>              <span class="co"># list all topics</span></span>
+<span id="cb4-11"><a href="#cb4-11" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb4-12"><a href="#cb4-12" aria-hidden="true" tabindex="-1"></a><span class="co"># Dump config schema for programmatic use</span></span>
+<span id="cb4-13"><a href="#cb4-13" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> config-schema</span>
+<span id="cb4-14"><a href="#cb4-14" aria-hidden="true" tabindex="-1"></a><span class="ex">axolotl</span> config-schema <span class="at">--field</span> adapter</span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 <p>If you’re working with the source repo, agent docs are also available at <code>docs/agents/</code> and the project overview is in <code>AGENTS.md</code>.</p>
 </section>
 <section id="getting-help" class="level2">
@@ -1049,13 +1024,13 @@ disable it, set AXOLOTL_DO_NOT_TRACK=1. For more details, see our <a href="https
 <section id="citing-axolotl" class="level2">
 <h2 class="anchored" data-anchor-id="citing-axolotl">📝 Citing Axolotl</h2>
 <p>If you use Axolotl in your research or projects, please cite it as follows:</p>
-<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb6"><pre class="sourceCode bibtex code-with-copy"><code class="sourceCode bibtex"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="co">@software{axolotl,</span></span>
-<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="co">  title = {Axolotl: Open Source LLM Post-Training},</span></span>
-<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a><span class="co">  author = {{Axolotl maintainers and contributors}},</span></span>
-<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a><span class="co">  url = {https://github.com/axolotl-ai-cloud/axolotl},</span></span>
-<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a><span class="co">  license = {Apache-2.0},</span></span>
-<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a><span class="co">  year = {2023}</span></span>
-<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a><span class="co">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
+<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb5"><pre class="sourceCode bibtex code-with-copy"><code class="sourceCode bibtex"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="co">@software{axolotl,</span></span>
+<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a><span class="co">  title = {Axolotl: Open Source LLM Post-Training},</span></span>
+<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a><span class="co">  author = {{Axolotl maintainers and contributors}},</span></span>
+<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a><span class="co">  url = {https://github.com/axolotl-ai-cloud/axolotl},</span></span>
+<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a><span class="co">  license = {Apache-2.0},</span></span>
+<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a><span class="co">  year = {2023}</span></span>
+<span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a><span class="co">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
 </section>
 <section id="license" class="level2">
 <h2 class="anchored" data-anchor-id="license">📜 License</h2>
diff --git a/search.json b/search.json
index 103c9affe..2f40c83e8 100644
--- a/search.json
+++ b/search.json
@@ -56,7 +56,7 @@
     "href": "docs/models/seed-oss.html#getting-started",
     "title": "Seed-OSS",
     "section": "Getting started",
-    "text": "Getting started\n\nInstall Axolotl following the installation guide.\nHere is an example of how to install from pip:\n# Ensure you have a compatible version of Pytorch installed\npip3 install packaging setuptools wheel ninja\npip3 install --no-build-isolation 'axolotl[flash-attn]&gt;=0.12.0'\n\n# Install Cut Cross Entropy\npython scripts/cutcrossentropy_install.py | sh\nRun the finetuning example:\n\naxolotl train examples/seed-oss/seed-oss-36b-qlora.yaml\nThis config uses about 27.7 GiB VRAM.\nLet us know how it goes. Happy finetuning! 🚀\n\nTIPS\n\nFor inference, the official Seed Team recommends top_p=0.95 and temperature=1.1.\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe dataset format follows the OpenAI Messages format as seen here.",
+    "text": "Getting started\n\nInstall Axolotl following the installation guide.\nHere is an example of how to install from pip:\n# Ensure you have a compatible version of Pytorch installed\nuv pip install --no-build-isolation 'axolotl[flash-attn]&gt;=0.12.0'\n\n# Install Cut Cross Entropy\npython scripts/cutcrossentropy_install.py | sh\nRun the finetuning example:\n\naxolotl train examples/seed-oss/seed-oss-36b-qlora.yaml\nThis config uses about 27.7 GiB VRAM.\nLet us know how it goes. Happy finetuning! 🚀\n\nTIPS\n\nFor inference, the official Seed Team recommends top_p=0.95 and temperature=1.1.\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe dataset format follows the OpenAI Messages format as seen here.",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -104,7 +104,7 @@
     "href": "docs/models/internvl3_5.html#getting-started",
     "title": "InternVL 3.5",
     "section": "Getting started",
-    "text": "Getting started\n\nInstall Axolotl following the installation guide.\nInstall timm for vision model support:\npip install timm==1.0.19\nInstall Cut Cross Entropy to reduce training VRAM usage.\nRun the finetuning example:\naxolotl train examples/internvl3_5/internvl3_5-8b-qlora.yml\n\nThis config uses about 8.21 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀\n\nTips\n\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe dataset format follows the multi-modal format as seen here.",
+    "text": "Getting started\n\nInstall Axolotl following the installation guide.\nInstall timm for vision model support:\nuv pip install timm==1.0.19\nInstall Cut Cross Entropy to reduce training VRAM usage.\nRun the finetuning example:\naxolotl train examples/internvl3_5/internvl3_5-8b-qlora.yml\n\nThis config uses about 8.21 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀\n\nTips\n\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe dataset format follows the multi-modal format as seen here.",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -152,7 +152,7 @@
     "href": "docs/models/apertus.html#getting-started",
     "title": "Apertus",
     "section": "Getting started",
-    "text": "Getting started\n\nInstall Axolotl following the installation guide. You need to install from main as Apertus is only on nightly or use our latest Docker images.\nHere is an example of how to install from main for pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.6.0 min)\ngit clone https://github.com/axolotl-ai-cloud/axolotl.git\ncd axolotl\n\npip3 install packaging==26.0 setuptools==75.8.0 wheel ninja\npip3 install --no-build-isolation -e '.[flash-attn]'\n\n# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy\npython scripts/cutcrossentropy_install.py | sh\n\n(Optional, highly recommended) Install XIELU CUDA\n\n## Recommended for reduced VRAM and faster speeds\n\n# Point to CUDA toolkit directory\n# For those using our Docker image, use the below path.\nexport CUDA_HOME=/usr/local/cuda\n\npip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps\nFor any installation errors, see XIELU Installation Issues\n\nRun the finetuning example:\n\naxolotl train examples/apertus/apertus-8b-qlora.yaml\nThis config uses about 8.7 GiB VRAM.\nLet us know how it goes. Happy finetuning! 
🚀\n\nTips\n\nFor inference, the official Apertus team recommends top_p=0.9 and temperature=0.8.\nYou can instead use full paremter fine-tuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe dataset format follows the OpenAI Messages format as seen here.\n\n\n\nXIELU Installation Issues\n\nModuleNotFoundError: No module named 'torch'\nPlease check these one by one:\n- Running in correct environment\n- Env has PyTorch installed\n- CUDA toolkit is at CUDA_HOME\nIf those didn’t help, please try the below solutions:\n\nPass env for CMAKE and try install again:\nPython_EXECUTABLE=$(which python) pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps\nGit clone the repo and manually hardcode python path:\ngit clone https://github.com/nickjbrowning/XIELU\ncd xielu\ngit checkout 59d6031\n\ncd xielu\nnano CMakeLists.txt  # or vi depending on your preference\nexecute_process(\n-    COMMAND ${Python_EXECUTABLE} -c \"import torch.utils; print(torch.utils.cmake_prefix_path)\"\n+    COMMAND /root/miniconda3/envs/py3.11/bin/python -c \"import torch.utils; print(torch.utils.cmake_prefix_path)\"\n    RESULT_VARIABLE TORCH_CMAKE_PATH_RESULT\n    OUTPUT_VARIABLE TORCH_CMAKE_PATH_OUTPUT\n    ERROR_VARIABLE TORCH_CMAKE_PATH_ERROR\n)\npip3 install . --no-build-isolation --no-deps",
+    "text": "Getting started\n\nInstall Axolotl following the installation guide. You need to install from main as Apertus is only on nightly or use our latest Docker images.\nHere is an example of how to install from main for pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.6.0 min)\ngit clone https://github.com/axolotl-ai-cloud/axolotl.git\ncd axolotl\n\nuv pip install --no-build-isolation -e '.[flash-attn]'\n\n# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy\npython scripts/cutcrossentropy_install.py | sh\n\n(Optional, highly recommended) Install XIELU CUDA\n\n## Recommended for reduced VRAM and faster speeds\n\n# Point to CUDA toolkit directory\n# For those using our Docker image, use the below path.\nexport CUDA_HOME=/usr/local/cuda\n\nuv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps\nFor any installation errors, see XIELU Installation Issues\n\nRun the finetuning example:\n\naxolotl train examples/apertus/apertus-8b-qlora.yaml\nThis config uses about 8.7 GiB VRAM.\nLet us know how it goes. Happy finetuning! 
🚀\n\nTips\n\nFor inference, the official Apertus team recommends top_p=0.9 and temperature=0.8.\nYou can instead use full paremter fine-tuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe dataset format follows the OpenAI Messages format as seen here.\n\n\n\nXIELU Installation Issues\n\nModuleNotFoundError: No module named 'torch'\nPlease check these one by one:\n- Running in correct environment\n- Env has PyTorch installed\n- CUDA toolkit is at CUDA_HOME\nIf those didn’t help, please try the below solutions:\n\nPass env for CMAKE and try install again:\nPython_EXECUTABLE=$(which python) uv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps\nGit clone the repo and manually hardcode python path:\ngit clone https://github.com/nickjbrowning/XIELU\ncd xielu\ngit checkout 59d6031\n\ncd xielu\nnano CMakeLists.txt  # or vi depending on your preference\nexecute_process(\n-    COMMAND ${Python_EXECUTABLE} -c \"import torch.utils; print(torch.utils.cmake_prefix_path)\"\n+    COMMAND /root/miniconda3/envs/py3.11/bin/python -c \"import torch.utils; print(torch.utils.cmake_prefix_path)\"\n    RESULT_VARIABLE TORCH_CMAKE_PATH_RESULT\n    OUTPUT_VARIABLE TORCH_CMAKE_PATH_OUTPUT\n    ERROR_VARIABLE TORCH_CMAKE_PATH_ERROR\n)\nuv pip install . --no-build-isolation --no-deps",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -212,7 +212,7 @@
     "href": "docs/models/smolvlm2.html#getting-started",
     "title": "SmolVLM 2",
     "section": "Getting Started",
-    "text": "Getting Started\n\nInstall Axolotl following the installation guide.\nHere is an example of how to install from pip:\n# Ensure you have a compatible version of Pytorch installed\npip3 install packaging setuptools wheel ninja\npip3 install --no-build-isolation 'axolotl[flash-attn]&gt;=0.12.0'\nInstall an extra dependency:\npip3 install num2words==0.5.14\nRun the finetuning example:\n# LoRA SFT (1x48GB @ 6.8GiB)\naxolotl train examples/smolvlm2/smolvlm2-2B-lora.yaml",
+    "text": "Getting Started\n\nInstall Axolotl following the installation guide.\nHere is an example of how to install from pip:\n# Ensure you have a compatible version of Pytorch installed\nuv pip install --no-build-isolation 'axolotl[flash-attn]&gt;=0.12.0'\nInstall an extra dependency:\nuv pip install num2words==0.5.14\nRun the finetuning example:\n# LoRA SFT (1x48GB @ 6.8GiB)\naxolotl train examples/smolvlm2/smolvlm2-2B-lora.yaml",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -272,7 +272,7 @@
     "href": "docs/models/arcee.html#getting-started",
     "title": "Arcee AFM",
     "section": "Getting started",
-    "text": "Getting started\n\nInstall Axolotl following the installation guide. You need to install from main as AFM is only on nightly or use our latest Docker images.\nHere is an example of how to install from main for pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.6.0 min)\ngit clone https://github.com/axolotl-ai-cloud/axolotl.git\ncd axolotl\n\npip3 install packaging==26.0 setuptools==75.8.0 wheel ninja\npip3 install --no-build-isolation -e '.[flash-attn]'\n\n# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy\npython scripts/cutcrossentropy_install.py | sh\n\nRun the finetuning example:\n\naxolotl train examples/arcee/afm-4.5b-qlora.yaml\nThis config uses about 7.8GiB VRAM.\nLet us know how it goes. Happy finetuning! 🚀\n\nTIPS\n\nFor inference, the official Arcee.ai team recommends top_p: 0.95, temperature: 0.5, top_k: 50, and repeat_penalty: 1.1.\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe dataset format follows the OpenAI Messages format as seen here.",
+    "text": "Getting started\n\nInstall Axolotl following the installation guide. You need to install from main as AFM is only on nightly or use our latest Docker images.\nHere is an example of how to install from main for pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.6.0 min)\ngit clone https://github.com/axolotl-ai-cloud/axolotl.git\ncd axolotl\n\nuv pip install --no-build-isolation -e '.[flash-attn]'\n\n# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy\npython scripts/cutcrossentropy_install.py | sh\n\nRun the finetuning example:\n\naxolotl train examples/arcee/afm-4.5b-qlora.yaml\nThis config uses about 7.8GiB VRAM.\nLet us know how it goes. Happy finetuning! 🚀\n\nTIPS\n\nFor inference, the official Arcee.ai team recommends top_p: 0.95, temperature: 0.5, top_k: 50, and repeat_penalty: 1.1.\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe dataset format follows the OpenAI Messages format as seen here.",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -334,7 +334,7 @@
     "href": "docs/models/ministral3/vision.html#getting-started",
     "title": "Ministral 3 Vision",
     "section": "Getting started",
-    "text": "Getting started\n\nInstall the required vision lib:\nbash  pip install 'mistral-common[opencv]==1.8.6'\nDownload the example dataset image:\nwget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg\nRun the fine-tuning:\naxolotl train examples/ministral3/vision/ministral3-3b-vision-qlora.yml\n\nWARNING: The loss and grad norm will be much higher than normal at first. We suspect this to be inherent to the model as of the moment. If anyone would like to submit a fix for this, we are happy to take a look.\n\nTips\nKey differences from text-only model:\n- Multi-modal dataset format required\n- Sample packing not supported",
+    "text": "Getting started\n\nInstall the required vision lib:\nbash  uv pip install 'mistral-common[opencv]==1.8.6'\nDownload the example dataset image:\nwget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg\nRun the fine-tuning:\naxolotl train examples/ministral3/vision/ministral3-3b-vision-qlora.yml\n\nWARNING: The loss and grad norm will be much higher than normal at first. We suspect this to be inherent to the model as of the moment. If anyone would like to submit a fix for this, we are happy to take a look.\n\nTips\nKey differences from text-only model:\n- Multi-modal dataset format required\n- Sample packing not supported",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -565,7 +565,7 @@
     "href": "docs/models/hunyuan.html#getting-started",
     "title": "Hunyuan",
     "section": "Getting started",
-    "text": "Getting started\n\nInstall Axolotl following the installation guide. You need to install from main as HunYuan is only on nightly or use our latest Docker images.\nHere is an example of how to install from main for pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.6.0 min)\ngit clone https://github.com/axolotl-ai-cloud/axolotl.git\ncd axolotl\n\npip3 install packaging==26.0 setuptools==75.8.0 wheel ninja\npip3 install --no-build-isolation -e '.[flash-attn]'\n\n# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy\npython scripts/cutcrossentropy_install.py | sh\n\nRun the finetuning example:\n\naxolotl train examples/hunyuan/hunyuan-v1-dense-qlora.yaml\nThis config uses about 4.7 GB VRAM.\nLet us know how it goes. Happy finetuning! 🚀\n\nDataset\nHunYuan Instruct models can choose to enter a slow think or fast think pattern. For best performance on fine-tuning their Instruct models, your dataset should be adjusted to match their pattern.\n# fast think pattern\nmessages = [\n    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n    {\"role\": \"user\", \"content\": \"/no_think What color is the sun?\" },\n    {\"role\": \"assistant\", \"content\": \"&lt;think&gt;\\n\\n&lt;/think&gt;\\n&lt;answer&gt;\\nThe sun is yellow.\\n&lt;/answer&gt;\"}\n]\n\n# slow think pattern\nmessages = [\n    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n    {\"role\": \"user\", \"content\": \"/no_think What color is the sun?\" },\n    {\"role\": \"assistant\", \"content\": \"&lt;think&gt;\\nThe user is asking about the color of the sun. 
I need to ...\\n&lt;/think&gt;\\n&lt;answer&gt;\\nThe sun is yellow.\\n&lt;/answer&gt;\"}\n]\n\n\nTIPS\n\nFor inference, the official Tencent team recommends\n\n\n{\n  \"do_sample\": true,\n  \"top_k\": 20,\n  \"top_p\": 0.8,\n  \"repetition_penalty\": 1.05,\n  \"temperature\": 0.7\n}\n\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe dataset format follows the OpenAI Messages format as seen here.",
+    "text": "Getting started\n\nInstall Axolotl following the installation guide. You need to install from main as HunYuan is only on nightly or use our latest Docker images.\nHere is an example of how to install from main for pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.6.0 min)\ngit clone https://github.com/axolotl-ai-cloud/axolotl.git\ncd axolotl\n\nuv pip install --no-build-isolation -e '.[flash-attn]'\n\n# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy\npython scripts/cutcrossentropy_install.py | sh\n\nRun the finetuning example:\n\naxolotl train examples/hunyuan/hunyuan-v1-dense-qlora.yaml\nThis config uses about 4.7 GB VRAM.\nLet us know how it goes. Happy finetuning! 🚀\n\nDataset\nHunYuan Instruct models can choose to enter a slow think or fast think pattern. For best performance on fine-tuning their Instruct models, your dataset should be adjusted to match their pattern.\n# fast think pattern\nmessages = [\n    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n    {\"role\": \"user\", \"content\": \"/no_think What color is the sun?\" },\n    {\"role\": \"assistant\", \"content\": \"&lt;think&gt;\\n\\n&lt;/think&gt;\\n&lt;answer&gt;\\nThe sun is yellow.\\n&lt;/answer&gt;\"}\n]\n\n# slow think pattern\nmessages = [\n    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n    {\"role\": \"user\", \"content\": \"/no_think What color is the sun?\" },\n    {\"role\": \"assistant\", \"content\": \"&lt;think&gt;\\nThe user is asking about the color of the sun. 
I need to ...\\n&lt;/think&gt;\\n&lt;answer&gt;\\nThe sun is yellow.\\n&lt;/answer&gt;\"}\n]\n\n\nTIPS\n\nFor inference, the official Tencent team recommends\n\n\n{\n  \"do_sample\": true,\n  \"top_k\": 20,\n  \"top_p\": 0.8,\n  \"repetition_penalty\": 1.05,\n  \"temperature\": 0.7\n}\n\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe dataset format follows the OpenAI Messages format as seen here.",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -685,7 +685,7 @@
     "href": "docs/models/gemma3n.html#getting-started",
     "title": "Gemma 3n",
     "section": "Getting started",
-    "text": "Getting started\n\nInstall Axolotl following the installation guide.\nHere is an example of how to install from pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.6.0 min)\npip3 install packaging==26.0 setuptools==75.8.0 wheel ninja\npip3 install --no-build-isolation 'axolotl[flash-attn]&gt;=0.12.0'\n\nIn addition to Axolotl’s requirements, Gemma-3n requires:\n\npip3 install timm==1.0.17\n\n# for loading audio data\npip3 install librosa==0.11.0\n\nDownload sample dataset files\n\n# for text + vision + audio only\nwget https://huggingface.co/datasets/Nanobit/text-vision-audio-2k-test/resolve/main/African_elephant.jpg\nwget https://huggingface.co/datasets/Nanobit/text-vision-audio-2k-test/resolve/main/En-us-African_elephant.oga\n\nRun the finetuning example:\n\n# text only\naxolotl train examples/gemma3n/gemma-3n-e2b-qlora.yml\n\n# text + vision\naxolotl train examples/gemma3n/gemma-3n-e2b-vision-qlora.yml\n\n# text + vision + audio\naxolotl train examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml\nLet us know how it goes. Happy finetuning! 🚀\nWARNING: The loss and grad norm will be much higher than normal. We suspect this to be inherent to the model as of the moment. If anyone would like to submit a fix for this, we are happy to take a look.\n\nTIPS\n\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe text dataset format follows the OpenAI Messages format as seen here.\nThe multimodal dataset format follows the OpenAI multi-content Messages format as seen here.",
+    "text": "Getting started\n\nInstall Axolotl following the installation guide.\nHere is an example of how to install from pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.6.0 min)\nuv pip install --no-build-isolation 'axolotl[flash-attn]&gt;=0.12.0'\n\nIn addition to Axolotl’s requirements, Gemma-3n requires:\n\nuv pip install timm==1.0.17\n\n# for loading audio data\nuv pip install librosa==0.11.0\n\nDownload sample dataset files\n\n# for text + vision + audio only\nwget https://huggingface.co/datasets/Nanobit/text-vision-audio-2k-test/resolve/main/African_elephant.jpg\nwget https://huggingface.co/datasets/Nanobit/text-vision-audio-2k-test/resolve/main/En-us-African_elephant.oga\n\nRun the finetuning example:\n\n# text only\naxolotl train examples/gemma3n/gemma-3n-e2b-qlora.yml\n\n# text + vision\naxolotl train examples/gemma3n/gemma-3n-e2b-vision-qlora.yml\n\n# text + vision + audio\naxolotl train examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml\nLet us know how it goes. Happy finetuning! 🚀\nWARNING: The loss and grad norm will be much higher than normal. We suspect this to be inherent to the model as of the moment. If anyone would like to submit a fix for this, we are happy to take a look.\n\nTIPS\n\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe text dataset format follows the OpenAI Messages format as seen here.\nThe multimodal dataset format follows the OpenAI multi-content Messages format as seen here.",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -733,7 +733,7 @@
     "href": "docs/models/devstral.html#getting-started",
     "title": "Devstral",
     "section": "Getting started",
-    "text": "Getting started\n\nInstall Axolotl following the installation guide.\nHere is an example of how to install from pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.6.0 min)\npip3 install packaging==26.0 setuptools==75.8.0 wheel ninja\npip3 install --no-build-isolation 'axolotl[flash-attn]&gt;=0.12.0'\n\nInstall Cut Cross Entropy to reduce training VRAM usage\n\npython scripts/cutcrossentropy_install.py | sh\n\nRun the finetuning example:\n\naxolotl train examples/devstral/devstral-small-qlora.yml\nThis config uses about 21GB VRAM.\nLet us know how it goes. Happy finetuning! 🚀\n\nTIPS\n\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe dataset format follows the OpenAI Messages format as seen here.\nLearn how to use function calling with Axolotl at docs.",
+    "text": "Getting started\n\nInstall Axolotl following the installation guide.\nHere is an example of how to install from pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.6.0 min)\nuv pip install --no-build-isolation 'axolotl[flash-attn]&gt;=0.12.0'\n\nInstall Cut Cross Entropy to reduce training VRAM usage\n\npython scripts/cutcrossentropy_install.py | sh\n\nRun the finetuning example:\n\naxolotl train examples/devstral/devstral-small-qlora.yml\nThis config uses about 21GB VRAM.\nLet us know how it goes. Happy finetuning! 🚀\n\nTIPS\n\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe dataset format follows the OpenAI Messages format as seen here.\nLearn how to use function calling with Axolotl at docs.",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -857,7 +857,7 @@
     "href": "docs/models/qwen3-next.html#getting-started",
     "title": "Qwen 3 Next",
     "section": "Getting started",
-    "text": "Getting started\n\nInstall Axolotl following the installation guide.\nInstall Cut Cross Entropy to reduce training VRAM usage.\nInstall FLA for improved performance\n\npip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1\n\nRun the finetuning example:\n\naxolotl train examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml\nThis config uses about ~47 GiB (no target experts) and ~71GiB (target experts) VRAM.\nLet us know how it goes. Happy finetuning! 🚀\n\nTIPS\n\nFor inference, you can experiment with temperature: 0.7, top_p: 0.8, top_k: 20, and min_p: 0.\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config. See Multi-GPU section below.\nRead more on how to load your own dataset at docs.\nThe dataset format follows the OpenAI Messages format as seen here.",
+    "text": "Getting started\n\nInstall Axolotl following the installation guide.\nInstall Cut Cross Entropy to reduce training VRAM usage.\nInstall FLA for improved performance\n\nuv pip uninstall causal-conv1d && uv pip install flash-linear-attention==0.4.1\n\nRun the finetuning example:\n\naxolotl train examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml\nThis config uses about ~47 GiB (no target experts) and ~71GiB (target experts) VRAM.\nLet us know how it goes. Happy finetuning! 🚀\n\nTIPS\n\nFor inference, you can experiment with temperature: 0.7, top_p: 0.8, top_k: 20, and min_p: 0.\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config. See Multi-GPU section below.\nRead more on how to load your own dataset at docs.\nThe dataset format follows the OpenAI Messages format as seen here.",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -1455,857 +1455,896 @@
     "text": "Name\nDescription\n\n\n\n\nSaveAxolotlConfigtoCometCallback\nCallback to save axolotl config to comet\n\n\n\n\n\nutils.callbacks.comet_.SaveAxolotlConfigtoCometCallback(axolotl_config_path)\nCallback to save axolotl config to comet"
   },
   {
-    "objectID": "docs/api/utils.callbacks.profiler.html",
-    "href": "docs/api/utils.callbacks.profiler.html",
-    "title": "utils.callbacks.profiler",
+    "objectID": "docs/api/prompt_strategies.base.html",
+    "href": "docs/api/prompt_strategies.base.html",
+    "title": "prompt_strategies.base",
     "section": "",
-    "text": "utils.callbacks.profiler\nHF Trainer callback for creating pytorch profiling snapshots\n\n\n\n\n\nName\nDescription\n\n\n\n\nPytorchProfilerCallback\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.\n\n\n\n\n\nutils.callbacks.profiler.PytorchProfilerCallback(\n    steps_to_profile=5,\n    profiler_steps_start=0,\n)\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.\nAlso runs torch.profiler to produce a Chrome trace for timing analysis."
+    "text": "prompt_strategies.base\nprompt_strategies.base\nmodule for base dataset transform strategies"
   },
   {
-    "objectID": "docs/api/utils.callbacks.profiler.html#classes",
-    "href": "docs/api/utils.callbacks.profiler.html#classes",
-    "title": "utils.callbacks.profiler",
+    "objectID": "docs/api/index.html",
+    "href": "docs/api/index.html",
+    "title": "API Reference",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nPytorchProfilerCallback\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.\n\n\n\n\n\nutils.callbacks.profiler.PytorchProfilerCallback(\n    steps_to_profile=5,\n    profiler_steps_start=0,\n)\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.\nAlso runs torch.profiler to produce a Chrome trace for timing analysis."
+    "text": "Core functionality for training\n\n\n\ntrain\nPrepare and train a model on a dataset. Can also infer from a model or merge lora\n\n\nevaluate\nModule for evaluating models.\n\n\ndatasets\nModule containing dataset functionality.\n\n\nconvert\nModule containing File Reader, File Writer, Json Parser, and Jsonl Serializer classes\n\n\nprompt_tokenizers\nModule containing PromptTokenizingStrategy and Prompter classes\n\n\nlogging_config\nCommon logging module for axolotl.\n\n\ncore.builders.base\nBase class for trainer builder\n\n\ncore.builders.causal\nBuilder for causal trainers\n\n\ncore.builders.rl\nBuilder for RLHF trainers\n\n\ncore.training_args\nextra axolotl specific training args\n\n\ncore.chat.messages\ninternal message representations of chat messages\n\n\ncore.chat.format.chatml\nChatML transformation functions for MessageContents\n\n\ncore.chat.format.llama3x\nLlama 3.x chat formatting functions for MessageContents\n\n\ncore.chat.format.shared\nshared functions for format transforms\n\n\ncore.datasets.chat\nchat dataset module\n\n\ncore.datasets.transforms.chat_builder\nThis module contains a function that builds a transform that takes a row from the\n\n\n\n\n\n\nCommand-line interface\n\n\n\ncli.main\nClick CLI definitions for various axolotl commands.\n\n\ncli.train\nCLI to run training on a model.\n\n\ncli.evaluate\nCLI to run evaluation on a model.\n\n\ncli.args\nModule for axolotl CLI command arguments.\n\n\ncli.art\nAxolotl ASCII logo utils.\n\n\ncli.checks\nVarious checks for Axolotl CLI.\n\n\ncli.config\nConfiguration loading and processing.\n\n\ncli.delinearize_llama4\nCLI tool to delinearize quantized/Linearized Llama-4 models.\n\n\ncli.inference\nCLI to run inference on a trained model.\n\n\ncli.merge_lora\nCLI to merge a trained LoRA into a base model.\n\n\ncli.merge_sharded_fsdp_weights\nCLI to merge sharded FSDP model checkpoints into a single combined checkpoint.\n\n\ncli.preprocess\nCLI to run preprocessing of a 
dataset.\n\n\ncli.quantize\nCLI to post-training quantize a model using torchao\n\n\ncli.vllm_serve\nCLI to start the vllm server for online RL\n\n\ncli.cloud.base\nbase class for cloud platforms from cli\n\n\ncli.cloud.modal_\nModal Cloud support from CLI\n\n\ncli.utils\nInit for axolotl.cli.utils module.\n\n\ncli.utils.args\nUtilities for axolotl CLI args.\n\n\ncli.utils.fetch\nUtilities for axolotl fetch CLI command.\n\n\ncli.utils.load\nUtilities for model, tokenizer, etc. loading.\n\n\ncli.utils.sweeps\nUtilities for handling sweeps over configs for axolotl train CLI command\n\n\ncli.utils.train\nUtilities for axolotl train CLI command.\n\n\n\n\n\n\nTraining implementations\n\n\n\ncore.trainers.base\nModule for customized trainers\n\n\ncore.trainers.trl\nModule for TRL RL trainers\n\n\ncore.trainers.mamba\nModule for mamba trainer\n\n\ncore.trainers.dpo.trainer\nDPO trainer for axolotl\n\n\ncore.trainers.grpo.trainer\nAxolotl GRPO trainers (with and without sequence parallelism handling)\n\n\ncore.trainers.grpo.sampler\nRepeat random sampler (similar to the one implemented in\n\n\ncore.trainers.utils\nUtils for Axolotl trainers\n\n\n\n\n\n\nFunctionality for loading and patching models, tokenizers, etc.\n\n\n\nloaders.model\nModel loader class implementation for loading, configuring, and patching various models.\n\n\nloaders.tokenizer\nTokenizer loading functionality and associated utils\n\n\nloaders.processor\nProcessor loading functionality for multi-modal models\n\n\nloaders.adapter\nAdapter loading functionality, including LoRA / QLoRA and associated utils\n\n\nloaders.patch_manager\nPatch manager class implementation to complement axolotl.loaders.ModelLoader.\n\n\nloaders.constants\nShared constants for axolotl.loaders module\n\n\n\n\n\n\nMixin classes for augmenting trainers\n\n\n\ncore.trainers.mixins.optimizer\nModule for Axolotl trainer optimizer mixin\n\n\ncore.trainers.mixins.rng_state_loader\nTemporary fix/override for bug in resume from 
checkpoint\n\n\ncore.trainers.mixins.scheduler\nModule for Axolotl trainer scheduler mixin\n\n\n\n\n\n\nContext managers for altering trainer behaviors\n\n\n\nutils.ctx_managers.sequence_parallel\nModule for Axolotl trainer sequence parallelism manager and utilities\n\n\n\n\n\n\nPrompt formatting strategies\n\n\n\nprompt_strategies.base\nmodule for base dataset transform strategies\n\n\nprompt_strategies.chat_template\nHF Chat Templates prompt strategy\n\n\nprompt_strategies.alpaca_chat\nModule for Alpaca prompt strategy classes\n\n\nprompt_strategies.alpaca_instruct\nModule loading the AlpacaInstructPromptTokenizingStrategy class\n\n\nprompt_strategies.alpaca_w_system\nPrompt strategies loader for alpaca instruction datasets with system prompts\n\n\nprompt_strategies.user_defined\nUser Defined prompts with configuration from the YML config\n\n\nprompt_strategies.llama2_chat\nPrompt Strategy for finetuning Llama2 chat models\n\n\nprompt_strategies.completion\nBasic completion text\n\n\nprompt_strategies.input_output\nModule for plain input/output prompt pairs\n\n\nprompt_strategies.stepwise_supervised\nModule for stepwise datasets, typically including a prompt and reasoning traces,\n\n\nprompt_strategies.metharme\nModule containing the MetharmenPromptTokenizingStrategy and MetharmePrompter class\n\n\nprompt_strategies.orcamini\nPrompt Strategy for finetuning Orca Mini (v2) models\n\n\nprompt_strategies.pygmalion\nModule containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class\n\n\nprompt_strategies.messages.chat\nChat dataset wrapping strategy for new internal messages representations\n\n\nprompt_strategies.dpo.chat_template\nDPO prompt strategies for using tokenizer chat templates.\n\n\nprompt_strategies.dpo.llama3\nDPO strategies for llama-3 chat template\n\n\nprompt_strategies.dpo.chatml\nDPO strategies for chatml\n\n\nprompt_strategies.dpo.zephyr\nDPO strategies for zephyr\n\n\nprompt_strategies.dpo.user_defined\nUser-defined DPO 
strategies\n\n\nprompt_strategies.dpo.passthrough\nDPO prompt strategies passthrough/zero-processing strategy\n\n\nprompt_strategies.kto.llama3\nKTO strategies for llama-3 chat template\n\n\nprompt_strategies.kto.chatml\nKTO strategies for chatml\n\n\nprompt_strategies.kto.user_defined\nUser-defined KTO strategies\n\n\nprompt_strategies.orpo.chat_template\nchatml prompt tokenization strategy for ORPO\n\n\nprompt_strategies.bradley_terry.llama3\nchatml transforms for datasets with system, input, chosen, rejected to match llama3 chat template\n\n\n\n\n\n\nLow-level performance optimizations\n\n\n\nkernels.lora\nModule for definition of Low-Rank Adaptation (LoRA) Triton kernels.\n\n\nkernels.geglu\nModule for definition of GEGLU Triton kernels.\n\n\nkernels.swiglu\nModule for definition of SwiGLU Triton kernels.\n\n\nkernels.quantize\nDequantization utilities for bitsandbytes and FP8 integration.\n\n\nkernels.utils\nUtilities for axolotl.kernels submodules.\n\n\n\n\n\n\nRuntime patches for model optimizations\n\n\n\nmonkeypatch.llama_attn_hijack_flash\nFlash attention monkey patch for llama model\n\n\nmonkeypatch.llama_attn_hijack_xformers\nDirectly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments\n\n\nmonkeypatch.mistral_attn_hijack_flash\nFlash attention monkey patch for mistral model\n\n\nmonkeypatch.multipack\nmultipack patching for v2 of sample packing\n\n\nmonkeypatch.relora\nImplements the ReLoRA training procedure from https://arxiv.org/abs/2307.05695, minus the initial full fine-tune.\n\n\nmonkeypatch.lora_kernels\nModule for patching custom LoRA Triton kernels and torch.autograd functions.\n\n\nmonkeypatch.utils\nShared utils for the monkeypatches\n\n\nmonkeypatch.btlm_attn_hijack_flash\nFlash attention monkey patch for cerebras btlm model\n\n\nmonkeypatch.stablelm_attn_hijack_flash\nPyTorch StableLM Epoch model.\n\n\nmonkeypatch.trainer_fsdp_optim\nfix for FSDP 
optimizer save in trainer w 4.47.0\n\n\nmonkeypatch.transformers_fa_utils\nsee https://github.com/huggingface/transformers/pull/35834\n\n\nmonkeypatch.data.batch_dataset_fetcher\nMonkey patches for the dataset fetcher to handle batches of packed indexes.\n\n\nmonkeypatch.mixtral\nPatches to support multipack for mixtral\n\n\nmonkeypatch.gradient_checkpointing.offload_cpu\nCPU offloaded checkpointing\n\n\nmonkeypatch.gradient_checkpointing.offload_disk\nDISCO - DIsk-based Storage and Checkpointing with Optimized prefetching\n\n\n\n\n\n\nUtility functions\n\n\n\nutils.tokenization\nModule for tokenization utilities\n\n\nutils.chat_templates\nThis module provides functionality for selecting chat templates based on user choices.\n\n\nutils.lora\nmodule to get the state dict of a merged lora model\n\n\nutils.model_shard_quant\nmodule to handle loading model on cpu/meta device for FSDP\n\n\nutils.bench\nBenchmarking and measurement utilities\n\n\nutils.freeze\nmodule to freeze/unfreeze parameters by name\n\n\nutils.trainer\nModule containing the Trainer class and related functions\n\n\nutils.schedulers\nModule for custom LRScheduler class\n\n\nutils.distributed\nUtilities for distributed functionality.\n\n\nutils.dict\nModule containing the DictDefault class\n\n\nutils.optimizers.adopt\nCopied from https://github.com/iShohei220/adopt\n\n\nutils.data.streaming\nData handling specific to streaming datasets.\n\n\nutils.data.sft\nData handling specific to SFT.\n\n\nutils.quantization\nUtilities for quantization including QAT and PTQ using torchao.\n\n\n\n\n\n\nPydantic data models for Axolotl config\n\n\n\nutils.schemas.config\nModule with Pydantic models for configuration.\n\n\nutils.schemas.model\nPydantic models for model input / output, etc. 
configuration\n\n\nutils.schemas.training\nPydantic models for training hyperparameters\n\n\nutils.schemas.datasets\nPydantic models for datasets-related configuration\n\n\nutils.schemas.peft\nPydantic models for PEFT-related configuration\n\n\nutils.schemas.trl\nPydantic models for TRL trainer configuration\n\n\nutils.schemas.multimodal\nPydantic models for multimodal-related configuration\n\n\nutils.schemas.integrations\nPydantic models for Axolotl integrations\n\n\nutils.schemas.enums\nEnums for Axolotl input config\n\n\nutils.schemas.utils\nUtilities for Axolotl Pydantic models\n\n\n\n\n\n\nThird-party integrations and extensions\n\n\n\nintegrations.base\nBase class for all plugins.\n\n\nintegrations.cut_cross_entropy.args\nModule for handling Cut Cross Entropy input arguments.\n\n\nintegrations.grokfast.optimizer\n\n\n\nintegrations.kd.trainer\nKD trainer\n\n\nintegrations.liger.args\nModule for handling LIGER input arguments.\n\n\nintegrations.lm_eval.args\nModule for handling lm eval harness input arguments.\n\n\nintegrations.spectrum.args\nModule for handling Spectrum input arguments.\n\n\n\n\n\n\nCommon utilities and shared functionality\n\n\n\ncommon.architectures\nCommon architecture specific constants\n\n\ncommon.const\nVarious shared constants\n\n\ncommon.datasets\nDataset loading utilities.\n\n\n\n\n\n\nCustom model implementations\n\n\n\nmodels.mamba.modeling_mamba\n\n\n\n\n\n\n\nData processing utilities\n\n\n\nutils.collators.core\nbasic shared collator constants\n\n\nutils.collators.batching\nData collators for axolotl to pad labels and position_ids for packed sequences\n\n\nutils.collators.mamba\ncollators for Mamba\n\n\nutils.collators.mm_chat\nCollators for multi-modal chat messages and packing\n\n\nutils.samplers.multipack\nMultipack Batch Sampler - An efficient batch sampler for packing variable-length sequences\n\n\n\n\n\n\nTraining callbacks\n\n\n\nutils.callbacks.perplexity\ncallback to calculate perplexity as an evaluation 
metric.\n\n\nutils.callbacks.profiler\nHF Trainer callback for creating pytorch profiling snapshots\n\n\nutils.callbacks.lisa\nmodule for LISA\n\n\nutils.callbacks.mlflow_\nMLFlow module for trainer callbacks\n\n\nutils.callbacks.comet_\nComet module for trainer callbacks\n\n\nutils.callbacks.qat\nQAT Callback for HF Causal Trainer"
   },
   {
-    "objectID": "docs/api/monkeypatch.utils.html",
-    "href": "docs/api/monkeypatch.utils.html",
-    "title": "monkeypatch.utils",
+    "objectID": "docs/api/index.html#core",
+    "href": "docs/api/index.html#core",
+    "title": "API Reference",
     "section": "",
-    "text": "monkeypatch.utils\nShared utils for the monkeypatches\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_cu_seqlens\ngenerate a cumulative sequence length mask for flash attention using attn mask\n\n\nget_cu_seqlens_from_pos_ids\ngenerate a cumulative sequence length mask for flash attention using pos ids\n\n\n\n\n\nmonkeypatch.utils.get_cu_seqlens(attn_mask)\ngenerate a cumulative sequence length mask for flash attention using attn mask\n\n\n\nmonkeypatch.utils.get_cu_seqlens_from_pos_ids(position_ids)\ngenerate a cumulative sequence length mask for flash attention using pos ids"
+    "text": "Core functionality for training\n\n\n\ntrain\nPrepare and train a model on a dataset. Can also infer from a model or merge lora\n\n\nevaluate\nModule for evaluating models.\n\n\ndatasets\nModule containing dataset functionality.\n\n\nconvert\nModule containing File Reader, File Writer, Json Parser, and Jsonl Serializer classes\n\n\nprompt_tokenizers\nModule containing PromptTokenizingStrategy and Prompter classes\n\n\nlogging_config\nCommon logging module for axolotl.\n\n\ncore.builders.base\nBase class for trainer builder\n\n\ncore.builders.causal\nBuilder for causal trainers\n\n\ncore.builders.rl\nBuilder for RLHF trainers\n\n\ncore.training_args\nextra axolotl specific training args\n\n\ncore.chat.messages\ninternal message representations of chat messages\n\n\ncore.chat.format.chatml\nChatML transformation functions for MessageContents\n\n\ncore.chat.format.llama3x\nLlama 3.x chat formatting functions for MessageContents\n\n\ncore.chat.format.shared\nshared functions for format transforms\n\n\ncore.datasets.chat\nchat dataset module\n\n\ncore.datasets.transforms.chat_builder\nThis module contains a function that builds a transform that takes a row from the"
   },
   {
-    "objectID": "docs/api/monkeypatch.utils.html#functions",
-    "href": "docs/api/monkeypatch.utils.html#functions",
-    "title": "monkeypatch.utils",
+    "objectID": "docs/api/index.html#cli",
+    "href": "docs/api/index.html#cli",
+    "title": "API Reference",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nget_cu_seqlens\ngenerate a cumulative sequence length mask for flash attention using attn mask\n\n\nget_cu_seqlens_from_pos_ids\ngenerate a cumulative sequence length mask for flash attention using pos ids\n\n\n\n\n\nmonkeypatch.utils.get_cu_seqlens(attn_mask)\ngenerate a cumulative sequence length mask for flash attention using attn mask\n\n\n\nmonkeypatch.utils.get_cu_seqlens_from_pos_ids(position_ids)\ngenerate a cumulative sequence length mask for flash attention using pos ids"
+    "text": "Command-line interface\n\n\n\ncli.main\nClick CLI definitions for various axolotl commands.\n\n\ncli.train\nCLI to run training on a model.\n\n\ncli.evaluate\nCLI to run evaluation on a model.\n\n\ncli.args\nModule for axolotl CLI command arguments.\n\n\ncli.art\nAxolotl ASCII logo utils.\n\n\ncli.checks\nVarious checks for Axolotl CLI.\n\n\ncli.config\nConfiguration loading and processing.\n\n\ncli.delinearize_llama4\nCLI tool to delinearize quantized/Linearized Llama-4 models.\n\n\ncli.inference\nCLI to run inference on a trained model.\n\n\ncli.merge_lora\nCLI to merge a trained LoRA into a base model.\n\n\ncli.merge_sharded_fsdp_weights\nCLI to merge sharded FSDP model checkpoints into a single combined checkpoint.\n\n\ncli.preprocess\nCLI to run preprocessing of a dataset.\n\n\ncli.quantize\nCLI to post-training quantize a model using torchao\n\n\ncli.vllm_serve\nCLI to start the vllm server for online RL\n\n\ncli.cloud.base\nbase class for cloud platforms from cli\n\n\ncli.cloud.modal_\nModal Cloud support from CLI\n\n\ncli.utils\nInit for axolotl.cli.utils module.\n\n\ncli.utils.args\nUtilities for axolotl CLI args.\n\n\ncli.utils.fetch\nUtilities for axolotl fetch CLI command.\n\n\ncli.utils.load\nUtilities for model, tokenizer, etc. loading.\n\n\ncli.utils.sweeps\nUtilities for handling sweeps over configs for axolotl train CLI command\n\n\ncli.utils.train\nUtilities for axolotl train CLI command."
   },
   {
-    "objectID": "docs/api/prompt_strategies.orpo.chat_template.html",
-    "href": "docs/api/prompt_strategies.orpo.chat_template.html",
-    "title": "prompt_strategies.orpo.chat_template",
+    "objectID": "docs/api/index.html#trainers",
+    "href": "docs/api/index.html#trainers",
+    "title": "API Reference",
     "section": "",
-    "text": "prompt_strategies.orpo.chat_template\nchatml prompt tokenization strategy for ORPO\n\n\n\n\n\nName\nDescription\n\n\n\n\nMessage\nmessage/turn\n\n\nMessageList\nconversation\n\n\nORPODatasetParsingStrategy\nStrategy to parse chosen rejected dataset into messagelist\n\n\nORPOPrompter\nSingle Turn prompter for ORPO\n\n\nORPOTokenizingStrategy\nrejected_ids\n\n\n\n\n\nprompt_strategies.orpo.chat_template.Message()\nmessage/turn\n\n\n\nprompt_strategies.orpo.chat_template.MessageList()\nconversation\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy()\nStrategy to parse chosen rejected dataset into messagelist\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_chosen_conversation_thread\nDataset structure mappings\n\n\nget_prompt\nMap the data to extract everything up to the last turn\n\n\nget_rejected_conversation_thread\nDataset structure mappings\n\n\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_chosen_conversation_thread(\n    prompt,\n)\nDataset structure mappings\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_prompt(\n    prompt,\n)\nMap the data to extract everything up to the last turn\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_rejected_conversation_thread(\n    prompt,\n)\nDataset structure mappings\n\n\n\n\n\nprompt_strategies.orpo.chat_template.ORPOPrompter(chat_template, tokenizer)\nSingle Turn prompter for ORPO\n\n\n\nprompt_strategies.orpo.chat_template.ORPOTokenizingStrategy(\n    *args,\n    dataset_parser=None,\n    **kwargs,\n)\nrejected_ids\ninput_ids\nrejected_attention_mask\nattention_mask\nrejected_labels\nlabels\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nload\nchatml transforms for datasets with system, input, chosen, rejected\n\n\n\n\n\nprompt_strategies.orpo.chat_template.load(tokenizer, cfg, ds_cfg=None, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected"
+    "text": "Training implementations\n\n\n\ncore.trainers.base\nModule for customized trainers\n\n\ncore.trainers.trl\nModule for TRL RL trainers\n\n\ncore.trainers.mamba\nModule for mamba trainer\n\n\ncore.trainers.dpo.trainer\nDPO trainer for axolotl\n\n\ncore.trainers.grpo.trainer\nAxolotl GRPO trainers (with and without sequence parallelism handling)\n\n\ncore.trainers.grpo.sampler\nRepeat random sampler (similar to the one implemented in\n\n\ncore.trainers.utils\nUtils for Axolotl trainers"
   },
   {
-    "objectID": "docs/api/prompt_strategies.orpo.chat_template.html#classes",
-    "href": "docs/api/prompt_strategies.orpo.chat_template.html#classes",
-    "title": "prompt_strategies.orpo.chat_template",
+    "objectID": "docs/api/index.html#model-loading",
+    "href": "docs/api/index.html#model-loading",
+    "title": "API Reference",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nMessage\nmessage/turn\n\n\nMessageList\nconversation\n\n\nORPODatasetParsingStrategy\nStrategy to parse chosen rejected dataset into messagelist\n\n\nORPOPrompter\nSingle Turn prompter for ORPO\n\n\nORPOTokenizingStrategy\nrejected_ids\n\n\n\n\n\nprompt_strategies.orpo.chat_template.Message()\nmessage/turn\n\n\n\nprompt_strategies.orpo.chat_template.MessageList()\nconversation\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy()\nStrategy to parse chosen rejected dataset into messagelist\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_chosen_conversation_thread\nDataset structure mappings\n\n\nget_prompt\nMap the data to extract everything up to the last turn\n\n\nget_rejected_conversation_thread\nDataset structure mappings\n\n\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_chosen_conversation_thread(\n    prompt,\n)\nDataset structure mappings\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_prompt(\n    prompt,\n)\nMap the data to extract everything up to the last turn\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_rejected_conversation_thread(\n    prompt,\n)\nDataset structure mappings\n\n\n\n\n\nprompt_strategies.orpo.chat_template.ORPOPrompter(chat_template, tokenizer)\nSingle Turn prompter for ORPO\n\n\n\nprompt_strategies.orpo.chat_template.ORPOTokenizingStrategy(\n    *args,\n    dataset_parser=None,\n    **kwargs,\n)\nrejected_ids\ninput_ids\nrejected_attention_mask\nattention_mask\nrejected_labels\nlabels"
+    "text": "Functionality for loading and patching models, tokenizers, etc.\n\n\n\nloaders.model\nModel loader class implementation for loading, configuring, and patching various models.\n\n\nloaders.tokenizer\nTokenizer loading functionality and associated utils\n\n\nloaders.processor\nProcessor loading functionality for multi-modal models\n\n\nloaders.adapter\nAdapter loading functionality, including LoRA / QLoRA and associated utils\n\n\nloaders.patch_manager\nPatch manager class implementation to complement axolotl.loaders.ModelLoader.\n\n\nloaders.constants\nShared constants for axolotl.loaders module"
   },
   {
-    "objectID": "docs/api/prompt_strategies.orpo.chat_template.html#functions",
-    "href": "docs/api/prompt_strategies.orpo.chat_template.html#functions",
-    "title": "prompt_strategies.orpo.chat_template",
+    "objectID": "docs/api/index.html#mixins",
+    "href": "docs/api/index.html#mixins",
+    "title": "API Reference",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nload\nchatml transforms for datasets with system, input, chosen, rejected\n\n\n\n\n\nprompt_strategies.orpo.chat_template.load(tokenizer, cfg, ds_cfg=None, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected"
+    "text": "Mixin classes for augmenting trainers\n\n\n\ncore.trainers.mixins.optimizer\nModule for Axolotl trainer optimizer mixin\n\n\ncore.trainers.mixins.rng_state_loader\nTemporary fix/override for bug in resume from checkpoint\n\n\ncore.trainers.mixins.scheduler\nModule for Axolotl trainer scheduler mixin"
   },
   {
-    "objectID": "docs/api/utils.chat_templates.html",
-    "href": "docs/api/utils.chat_templates.html",
-    "title": "utils.chat_templates",
+    "objectID": "docs/api/index.html#context-managers",
+    "href": "docs/api/index.html#context-managers",
+    "title": "API Reference",
     "section": "",
-    "text": "utils.chat_templates\nutils.chat_templates\nThis module provides functionality for selecting chat templates based on user choices.\nThese templates are used for formatting messages in a conversation."
+    "text": "Context managers for altering trainer behaviors\n\n\n\nutils.ctx_managers.sequence_parallel\nModule for Axolotl trainer sequence parallelism manager and utilities"
   },
   {
-    "objectID": "docs/api/prompt_strategies.dpo.chatml.html",
-    "href": "docs/api/prompt_strategies.dpo.chatml.html",
-    "title": "prompt_strategies.dpo.chatml",
+    "objectID": "docs/api/index.html#prompt-strategies",
+    "href": "docs/api/index.html#prompt-strategies",
+    "title": "API Reference",
     "section": "",
-    "text": "prompt_strategies.dpo.chatml\nDPO strategies for chatml\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.chatml.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.chatml.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations"
+    "text": "Prompt formatting strategies\n\n\n\nprompt_strategies.base\nmodule for base dataset transform strategies\n\n\nprompt_strategies.chat_template\nHF Chat Templates prompt strategy\n\n\nprompt_strategies.alpaca_chat\nModule for Alpaca prompt strategy classes\n\n\nprompt_strategies.alpaca_instruct\nModule loading the AlpacaInstructPromptTokenizingStrategy class\n\n\nprompt_strategies.alpaca_w_system\nPrompt strategies loader for alpaca instruction datasets with system prompts\n\n\nprompt_strategies.user_defined\nUser Defined prompts with configuration from the YML config\n\n\nprompt_strategies.llama2_chat\nPrompt Strategy for finetuning Llama2 chat models\n\n\nprompt_strategies.completion\nBasic completion text\n\n\nprompt_strategies.input_output\nModule for plain input/output prompt pairs\n\n\nprompt_strategies.stepwise_supervised\nModule for stepwise datasets, typically including a prompt and reasoning traces,\n\n\nprompt_strategies.metharme\nModule containing the MetharmenPromptTokenizingStrategy and MetharmePrompter class\n\n\nprompt_strategies.orcamini\nPrompt Strategy for finetuning Orca Mini (v2) models\n\n\nprompt_strategies.pygmalion\nModule containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class\n\n\nprompt_strategies.messages.chat\nChat dataset wrapping strategy for new internal messages representations\n\n\nprompt_strategies.dpo.chat_template\nDPO prompt strategies for using tokenizer chat templates.\n\n\nprompt_strategies.dpo.llama3\nDPO strategies for llama-3 chat template\n\n\nprompt_strategies.dpo.chatml\nDPO strategies for chatml\n\n\nprompt_strategies.dpo.zephyr\nDPO strategies for zephyr\n\n\nprompt_strategies.dpo.user_defined\nUser-defined DPO strategies\n\n\nprompt_strategies.dpo.passthrough\nDPO prompt strategies passthrough/zero-processing strategy\n\n\nprompt_strategies.kto.llama3\nKTO strategies for llama-3 chat template\n\n\nprompt_strategies.kto.chatml\nKTO strategies for 
chatml\n\n\nprompt_strategies.kto.user_defined\nUser-defined KTO strategies\n\n\nprompt_strategies.orpo.chat_template\nchatml prompt tokenization strategy for ORPO\n\n\nprompt_strategies.bradley_terry.llama3\nchatml transforms for datasets with system, input, chosen, rejected to match llama3 chat template"
   },
   {
-    "objectID": "docs/api/prompt_strategies.dpo.chatml.html#functions",
-    "href": "docs/api/prompt_strategies.dpo.chatml.html#functions",
-    "title": "prompt_strategies.dpo.chatml",
+    "objectID": "docs/api/index.html#kernels",
+    "href": "docs/api/index.html#kernels",
+    "title": "API Reference",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.chatml.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.chatml.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations"
+    "text": "Low-level performance optimizations\n\n\n\nkernels.lora\nModule for definition of Low-Rank Adaptation (LoRA) Triton kernels.\n\n\nkernels.geglu\nModule for definition of GEGLU Triton kernels.\n\n\nkernels.swiglu\nModule for definition of SwiGLU Triton kernels.\n\n\nkernels.quantize\nDequantization utilities for bitsandbytes and FP8 integration.\n\n\nkernels.utils\nUtilities for axolotl.kernels submodules."
   },
   {
-    "objectID": "docs/api/monkeypatch.gradient_checkpointing.offload_disk.html",
-    "href": "docs/api/monkeypatch.gradient_checkpointing.offload_disk.html",
-    "title": "monkeypatch.gradient_checkpointing.offload_disk",
+    "objectID": "docs/api/index.html#monkey-patches",
+    "href": "docs/api/index.html#monkey-patches",
+    "title": "API Reference",
     "section": "",
-    "text": "monkeypatch.gradient_checkpointing.offload_disk\nDISCO - DIsk-based Storage and Checkpointing with Optimized prefetching\n\n\n\n\n\nName\nDescription\n\n\n\n\nDisco\nDisco: DIsk-based Storage and Checkpointing with Optimized prefetching\n\n\nDiskOffloadManager\nManages offloaded tensors and handles prefetching in a separate thread.\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.Disco()\nDisco: DIsk-based Storage and Checkpointing with Optimized prefetching\nAdvanced disk-based gradient checkpointer with prefetching.\n\n\n\n\n\nName\nDescription\n\n\n\n\nbackward\nBackward pass that loads activations from disk with prefetching\n\n\nforward\nForward pass that offloads activations to disk asynchronously\n\n\nget_instance\nGet or create the offload manager\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.Disco.backward(\n    ctx,\n    *grad_outputs,\n)\nBackward pass that loads activations from disk with prefetching\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.Disco.forward(\n    ctx,\n    forward_function,\n    hidden_states,\n    *args,\n    prefetch_size=1,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nForward pass that offloads activations to disk asynchronously\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.Disco.get_instance(\n    prefetch_size=1,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nGet or create the offload manager\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager(\n    prefetch_size=3,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nManages offloaded tensors and handles prefetching in a separate thread.\nIncludes synchronization to prevent race conditions.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncleanup\nClean up all temp files and stop prefetch thread with proper synchronization\n\n\ncleanup_tensor\nClean up a specific tensor file after it’s been used\n\n\nload_tensor\nLoad tensor from disk or prefetch cache with proper 
synchronization\n\n\nsave_tensor\nSave tensor to disk asynchronously and return file path with thread-safe operations\n\n\ntrigger_prefetch\nTrigger prefetching of the next N tensors with proper synchronization\n\n\nwait_for_save\nWait for a tensor to be saved to disk\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup()\nClean up all temp files and stop prefetch thread with proper synchronization\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup_tensor(\n    file_path,\n)\nClean up a specific tensor file after it’s been used\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.load_tensor(\n    file_path,\n    target_device='cuda',\n)\nLoad tensor from disk or prefetch cache with proper synchronization\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.save_tensor(\n    tensor,\n)\nSave tensor to disk asynchronously and return file path with thread-safe operations\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.trigger_prefetch(\n    n=None,\n)\nTrigger prefetching of the next N tensors with proper synchronization\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.wait_for_save(\n    file_path,\n    timeout=None,\n)\nWait for a tensor to be saved to disk"
+    "text": "Runtime patches for model optimizations\n\n\n\nmonkeypatch.llama_attn_hijack_flash\nFlash attention monkey patch for llama model\n\n\nmonkeypatch.llama_attn_hijack_xformers\nDirectly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments\n\n\nmonkeypatch.mistral_attn_hijack_flash\nFlash attention monkey patch for mistral model\n\n\nmonkeypatch.multipack\nmultipack patching for v2 of sample packing\n\n\nmonkeypatch.relora\nImplements the ReLoRA training procedure from https://arxiv.org/abs/2307.05695, minus the initial full fine-tune.\n\n\nmonkeypatch.lora_kernels\nModule for patching custom LoRA Triton kernels and torch.autograd functions.\n\n\nmonkeypatch.utils\nShared utils for the monkeypatches\n\n\nmonkeypatch.btlm_attn_hijack_flash\nFlash attention monkey patch for cerebras btlm model\n\n\nmonkeypatch.stablelm_attn_hijack_flash\nPyTorch StableLM Epoch model.\n\n\nmonkeypatch.trainer_fsdp_optim\nfix for FSDP optimizer save in trainer w 4.47.0\n\n\nmonkeypatch.transformers_fa_utils\nsee https://github.com/huggingface/transformers/pull/35834\n\n\nmonkeypatch.data.batch_dataset_fetcher\nMonkey patches for the dataset fetcher to handle batches of packed indexes.\n\n\nmonkeypatch.mixtral\nPatches to support multipack for mixtral\n\n\nmonkeypatch.gradient_checkpointing.offload_cpu\nCPU offloaded checkpointing\n\n\nmonkeypatch.gradient_checkpointing.offload_disk\nDISCO - DIsk-based Storage and Checkpointing with Optimized prefetching"
   },
   {
-    "objectID": "docs/api/monkeypatch.gradient_checkpointing.offload_disk.html#classes",
-    "href": "docs/api/monkeypatch.gradient_checkpointing.offload_disk.html#classes",
-    "title": "monkeypatch.gradient_checkpointing.offload_disk",
+    "objectID": "docs/api/index.html#utils",
+    "href": "docs/api/index.html#utils",
+    "title": "API Reference",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nDisco\nDisco: DIsk-based Storage and Checkpointing with Optimized prefetching\n\n\nDiskOffloadManager\nManages offloaded tensors and handles prefetching in a separate thread.\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.Disco()\nDisco: DIsk-based Storage and Checkpointing with Optimized prefetching\nAdvanced disk-based gradient checkpointer with prefetching.\n\n\n\n\n\nName\nDescription\n\n\n\n\nbackward\nBackward pass that loads activations from disk with prefetching\n\n\nforward\nForward pass that offloads activations to disk asynchronously\n\n\nget_instance\nGet or create the offload manager\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.Disco.backward(\n    ctx,\n    *grad_outputs,\n)\nBackward pass that loads activations from disk with prefetching\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.Disco.forward(\n    ctx,\n    forward_function,\n    hidden_states,\n    *args,\n    prefetch_size=1,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nForward pass that offloads activations to disk asynchronously\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.Disco.get_instance(\n    prefetch_size=1,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nGet or create the offload manager\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager(\n    prefetch_size=3,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nManages offloaded tensors and handles prefetching in a separate thread.\nIncludes synchronization to prevent race conditions.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncleanup\nClean up all temp files and stop prefetch thread with proper synchronization\n\n\ncleanup_tensor\nClean up a specific tensor file after it’s been used\n\n\nload_tensor\nLoad tensor from disk or prefetch cache with proper synchronization\n\n\nsave_tensor\nSave tensor to disk asynchronously and return file path with thread-safe operations\n\n\ntrigger_prefetch\nTrigger prefetching of the 
next N tensors with proper synchronization\n\n\nwait_for_save\nWait for a tensor to be saved to disk\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup()\nClean up all temp files and stop prefetch thread with proper synchronization\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup_tensor(\n    file_path,\n)\nClean up a specific tensor file after it’s been used\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.load_tensor(\n    file_path,\n    target_device='cuda',\n)\nLoad tensor from disk or prefetch cache with proper synchronization\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.save_tensor(\n    tensor,\n)\nSave tensor to disk asynchronously and return file path with thread-safe operations\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.trigger_prefetch(\n    n=None,\n)\nTrigger prefetching of the next N tensors with proper synchronization\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.wait_for_save(\n    file_path,\n    timeout=None,\n)\nWait for a tensor to be saved to disk"
+    "text": "Utility functions\n\n\n\nutils.tokenization\nModule for tokenization utilities\n\n\nutils.chat_templates\nThis module provides functionality for selecting chat templates based on user choices.\n\n\nutils.lora\nmodule to get the state dict of a merged lora model\n\n\nutils.model_shard_quant\nmodule to handle loading model on cpu/meta device for FSDP\n\n\nutils.bench\nBenchmarking and measurement utilities\n\n\nutils.freeze\nmodule to freeze/unfreeze parameters by name\n\n\nutils.trainer\nModule containing the Trainer class and related functions\n\n\nutils.schedulers\nModule for custom LRScheduler class\n\n\nutils.distributed\nUtilities for distributed functionality.\n\n\nutils.dict\nModule containing the DictDefault class\n\n\nutils.optimizers.adopt\nCopied from https://github.com/iShohei220/adopt\n\n\nutils.data.streaming\nData handling specific to streaming datasets.\n\n\nutils.data.sft\nData handling specific to SFT.\n\n\nutils.quantization\nUtilities for quantization including QAT and PTQ using torchao."
   },
   {
-    "objectID": "docs/api/cli.checks.html",
-    "href": "docs/api/cli.checks.html",
-    "title": "cli.checks",
+    "objectID": "docs/api/index.html#schemas",
+    "href": "docs/api/index.html#schemas",
+    "title": "API Reference",
     "section": "",
-    "text": "cli.checks\nVarious checks for Axolotl CLI.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncheck_accelerate_default_config\nLogs at warning level if no accelerate config file is found.\n\n\ncheck_user_token\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\ncli.checks.check_accelerate_default_config()\nLogs at warning level if no accelerate config file is found.\n\n\n\ncli.checks.check_user_token()\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nBoolean indicating successful check (i.e., HF_HUB_OFFLINE=1 or HF user info is retrieved).\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLocalTokenNotFoundError\nIf HF user info can’t be retrieved."
+    "text": "Pydantic data models for Axolotl config\n\n\n\nutils.schemas.config\nModule with Pydantic models for configuration.\n\n\nutils.schemas.model\nPydantic models for model input / output, etc. configuration\n\n\nutils.schemas.training\nPydantic models for training hyperparameters\n\n\nutils.schemas.datasets\nPydantic models for datasets-related configuration\n\n\nutils.schemas.peft\nPydantic models for PEFT-related configuration\n\n\nutils.schemas.trl\nPydantic models for TRL trainer configuration\n\n\nutils.schemas.multimodal\nPydantic models for multimodal-related configuration\n\n\nutils.schemas.integrations\nPydantic models for Axolotl integrations\n\n\nutils.schemas.enums\nEnums for Axolotl input config\n\n\nutils.schemas.utils\nUtilities for Axolotl Pydantic models"
   },
   {
-    "objectID": "docs/api/cli.checks.html#functions",
-    "href": "docs/api/cli.checks.html#functions",
-    "title": "cli.checks",
+    "objectID": "docs/api/index.html#integrations",
+    "href": "docs/api/index.html#integrations",
+    "title": "API Reference",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ncheck_accelerate_default_config\nLogs at warning level if no accelerate config file is found.\n\n\ncheck_user_token\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\ncli.checks.check_accelerate_default_config()\nLogs at warning level if no accelerate config file is found.\n\n\n\ncli.checks.check_user_token()\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nBoolean indicating successful check (i.e., HF_HUB_OFFLINE=1 or HF user info is retrieved).\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLocalTokenNotFoundError\nIf HF user info can’t be retrieved."
+    "text": "Third-party integrations and extensions\n\n\n\nintegrations.base\nBase class for all plugins.\n\n\nintegrations.cut_cross_entropy.args\nModule for handling Cut Cross Entropy input arguments.\n\n\nintegrations.grokfast.optimizer\n\n\n\nintegrations.kd.trainer\nKD trainer\n\n\nintegrations.liger.args\nModule for handling LIGER input arguments.\n\n\nintegrations.lm_eval.args\nModule for handling lm eval harness input arguments.\n\n\nintegrations.spectrum.args\nModule for handling Spectrum input arguments."
   },
   {
-    "objectID": "docs/api/utils.schemas.utils.html",
-    "href": "docs/api/utils.schemas.utils.html",
-    "title": "utils.schemas.utils",
+    "objectID": "docs/api/index.html#common",
+    "href": "docs/api/index.html#common",
+    "title": "API Reference",
     "section": "",
-    "text": "utils.schemas.utils\nUtilities for Axolotl Pydantic models\n\n\n\n\n\nName\nDescription\n\n\n\n\nhandle_legacy_message_fields_logic\nHandle backwards compatibility between legacy message field mapping and new property mapping system.\n\n\n\n\n\nutils.schemas.utils.handle_legacy_message_fields_logic(data)\nHandle backwards compatibility between legacy message field mapping and new property mapping system.\nPreviously, the config only supported mapping ‘role’ and ‘content’ fields via dedicated config options:\n- message_field_role: Mapped to the role field\n- message_field_content: Mapped to the content field\nThe new system uses message_property_mappings to support arbitrary field mappings:\nmessage_property_mappings:\nrole: source_role_field\ncontent: source_content_field\nadditional_field: source_field\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\ndict\nDictionary containing configuration data\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ndict\nUpdated dictionary with message field mappings consolidated\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf there are conflicts between legacy and new mappings"
+    "text": "Common utilities and shared functionality\n\n\n\ncommon.architectures\nCommon architecture specific constants\n\n\ncommon.const\nVarious shared constants\n\n\ncommon.datasets\nDataset loading utilities."
   },
   {
-    "objectID": "docs/api/utils.schemas.utils.html#functions",
-    "href": "docs/api/utils.schemas.utils.html#functions",
-    "title": "utils.schemas.utils",
+    "objectID": "docs/api/index.html#models",
+    "href": "docs/api/index.html#models",
+    "title": "API Reference",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nhandle_legacy_message_fields_logic\nHandle backwards compatibility between legacy message field mapping and new property mapping system.\n\n\n\n\n\nutils.schemas.utils.handle_legacy_message_fields_logic(data)\nHandle backwards compatibility between legacy message field mapping and new property mapping system.\nPreviously, the config only supported mapping ‘role’ and ‘content’ fields via dedicated config options:\n- message_field_role: Mapped to the role field\n- message_field_content: Mapped to the content field\nThe new system uses message_property_mappings to support arbitrary field mappings:\nmessage_property_mappings:\nrole: source_role_field\ncontent: source_content_field\nadditional_field: source_field\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\ndict\nDictionary containing configuration data\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ndict\nUpdated dictionary with message field mappings consolidated\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf there are conflicts between legacy and new mappings"
+    "text": "Custom model implementations\n\n\n\nmodels.mamba.modeling_mamba"
   },
   {
-    "objectID": "docs/api/monkeypatch.lora_kernels.html",
-    "href": "docs/api/monkeypatch.lora_kernels.html",
-    "title": "monkeypatch.lora_kernels",
+    "objectID": "docs/api/index.html#data-processing",
+    "href": "docs/api/index.html#data-processing",
+    "title": "API Reference",
     "section": "",
-    "text": "monkeypatch.lora_kernels\nModule for patching custom LoRA Triton kernels and torch.autograd functions.\n\n\n\n\n\nName\nDescription\n\n\n\n\nFakeMLP\nplaceholder MLP for triton patching\n\n\n\n\n\nmonkeypatch.lora_kernels.FakeMLP(gate_proj, up_proj, down_proj)\nplaceholder MLP for triton patching\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\napply_lora_kernel_patches\nApplies optimized Triton kernel patches to a PEFT model.\n\n\nget_attention_cls_from_config\nGet the appropriate attention class by inspecting the model config.\n\n\nget_layers\nGet the layers of the model. Handles text-only and multimodal models.\n\n\noriginal_apply_o\nOriginal implementation of output projection without optimizations.\n\n\noriginal_apply_qkv\nOriginal implementation of QKV projection without optimizations.\n\n\noriginal_apply_qkv_optional_v\nQKV projection for models where v_proj may be None (e.g. Gemma4 attention_k_eq_v).\n\n\npatch_self_attn_lora\nGiven an axolotl config, this method patches the inferred attention class forward\n\n\n\n\n\nmonkeypatch.lora_kernels.apply_lora_kernel_patches(model, cfg)\nApplies optimized Triton kernel patches to a PEFT model.\nPatches a PEFT model with optimized implementations for MLP and attention\ncomputations. 
The optimizations include custom Triton kernels for activation\nfunctions and specialized autograd functions for LoRA computations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\nPeftModelForCausalLM\nA PEFT model to be patched with optimized kernels.\nrequired\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nPeftModelForCausalLM\nPeftModelForCausalLM\nThe patched model with optimized kernels.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTypeError\nIf the provided model is not a PeftModelForCausalLM.\n\n\n\nNotImplementedError\nIf the model type is not supported.\n\n\n\nAssertionError\nIf multiple adapters are active (currently unsupported).\n\n\n\n\n\n\nThe optimizations require LoRA adapters with no dropout and no bias terms. The\nfunction will skip patching if these conditions aren’t met.\n\n\n\n\nmonkeypatch.lora_kernels.get_attention_cls_from_config(cfg)\nGet the appropriate attention class by inspecting the model config.\nUses dynamic import to support any model architecture that follows\nthe standard transformers naming convention.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nType[nn.Module]\nThe appropriate attention class for the model.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf base_model not specified or attention class cannot be imported\n\n\n\nImportError\nIf the model module or attention class doesn’t exist\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.get_layers(model)\nGet the layers of the model. 
Handles text-only and multimodal models.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\nPeftModelForCausalLM\nA PEFT model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[nn.Module]\nA list of layers.\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.original_apply_o(self, hidden_states)\nOriginal implementation of output projection without optimizations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nself\nnn.Module\nThe attention module instance.\nrequired\n\n\nhidden_states\ntorch.Tensor\nInput tensor of shape [batch_size, seq_len, hidden_dim]`.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nThe output projection result.\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.original_apply_qkv(self, hidden_states)\nOriginal implementation of QKV projection without optimizations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nself\nnn.Module\nThe attention module instance.\nrequired\n\n\nhidden_states\ntorch.Tensor\nInput tensor of shape [batch_size, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nA tuple (query_states, key_states, value_states) containing the projected states for query, key, and value.\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.original_apply_qkv_optional_v(self, hidden_states)\nQKV projection for models where v_proj may be None (e.g. Gemma4 attention_k_eq_v).\nWhen v_proj is None, key_states are reused as value_states.\n\n\n\nmonkeypatch.lora_kernels.patch_self_attn_lora(cfg)\nGiven an axolotl config, this method patches the inferred attention class forward\npass with optimized LoRA implementations.\nIt modifies the attention class to use optimized QKV and output projections. 
The\noriginal implementation is preserved and can be restored if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nAssertionError\nIf the required code blocks are not found in the attention implementation."
+    "text": "Data processing utilities\n\n\n\nutils.collators.core\nbasic shared collator constants\n\n\nutils.collators.batching\nData collators for axolotl to pad labels and position_ids for packed sequences\n\n\nutils.collators.mamba\ncollators for Mamba\n\n\nutils.collators.mm_chat\nCollators for multi-modal chat messages and packing\n\n\nutils.samplers.multipack\nMultipack Batch Sampler - An efficient batch sampler for packing variable-length sequences"
   },
   {
-    "objectID": "docs/api/monkeypatch.lora_kernels.html#classes",
-    "href": "docs/api/monkeypatch.lora_kernels.html#classes",
-    "title": "monkeypatch.lora_kernels",
+    "objectID": "docs/api/index.html#callbacks",
+    "href": "docs/api/index.html#callbacks",
+    "title": "API Reference",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nFakeMLP\nplaceholder MLP for triton patching\n\n\n\n\n\nmonkeypatch.lora_kernels.FakeMLP(gate_proj, up_proj, down_proj)\nplaceholder MLP for triton patching"
+    "text": "Training callbacks\n\n\n\nutils.callbacks.perplexity\ncallback to calculate perplexity as an evaluation metric.\n\n\nutils.callbacks.profiler\nHF Trainer callback for creating pytorch profiling snapshots\n\n\nutils.callbacks.lisa\nmodule for LISA\n\n\nutils.callbacks.mlflow_\nMLFlow module for trainer callbacks\n\n\nutils.callbacks.comet_\nComet module for trainer callbacks\n\n\nutils.callbacks.qat\nQAT Callback for HF Causal Trainer"
   },
   {
-    "objectID": "docs/api/monkeypatch.lora_kernels.html#functions",
-    "href": "docs/api/monkeypatch.lora_kernels.html#functions",
-    "title": "monkeypatch.lora_kernels",
+    "objectID": "docs/api/core.trainers.mixins.rng_state_loader.html",
+    "href": "docs/api/core.trainers.mixins.rng_state_loader.html",
+    "title": "core.trainers.mixins.rng_state_loader",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\napply_lora_kernel_patches\nApplies optimized Triton kernel patches to a PEFT model.\n\n\nget_attention_cls_from_config\nGet the appropriate attention class by inspecting the model config.\n\n\nget_layers\nGet the layers of the model. Handles text-only and multimodal models.\n\n\noriginal_apply_o\nOriginal implementation of output projection without optimizations.\n\n\noriginal_apply_qkv\nOriginal implementation of QKV projection without optimizations.\n\n\noriginal_apply_qkv_optional_v\nQKV projection for models where v_proj may be None (e.g. Gemma4 attention_k_eq_v).\n\n\npatch_self_attn_lora\nGiven an axolotl config, this method patches the inferred attention class forward\n\n\n\n\n\nmonkeypatch.lora_kernels.apply_lora_kernel_patches(model, cfg)\nApplies optimized Triton kernel patches to a PEFT model.\nPatches a PEFT model with optimized implementations for MLP and attention\ncomputations. The optimizations include custom Triton kernels for activation\nfunctions and specialized autograd functions for LoRA computations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\nPeftModelForCausalLM\nA PEFT model to be patched with optimized kernels.\nrequired\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nPeftModelForCausalLM\nPeftModelForCausalLM\nThe patched model with optimized kernels.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTypeError\nIf the provided model is not a PeftModelForCausalLM.\n\n\n\nNotImplementedError\nIf the model type is not supported.\n\n\n\nAssertionError\nIf multiple adapters are active (currently unsupported).\n\n\n\n\n\n\nThe optimizations require LoRA adapters with no dropout and no bias terms. 
The\nfunction will skip patching if these conditions aren’t met.\n\n\n\n\nmonkeypatch.lora_kernels.get_attention_cls_from_config(cfg)\nGet the appropriate attention class by inspecting the model config.\nUses dynamic import to support any model architecture that follows\nthe standard transformers naming convention.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nType[nn.Module]\nThe appropriate attention class for the model.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf base_model not specified or attention class cannot be imported\n\n\n\nImportError\nIf the model module or attention class doesn’t exist\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.get_layers(model)\nGet the layers of the model. Handles text-only and multimodal models.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\nPeftModelForCausalLM\nA PEFT model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[nn.Module]\nA list of layers.\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.original_apply_o(self, hidden_states)\nOriginal implementation of output projection without optimizations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nself\nnn.Module\nThe attention module instance.\nrequired\n\n\nhidden_states\ntorch.Tensor\nInput tensor of shape [batch_size, seq_len, hidden_dim]`.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nThe output projection result.\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.original_apply_qkv(self, hidden_states)\nOriginal implementation of QKV projection without optimizations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nself\nnn.Module\nThe attention module instance.\nrequired\n\n\nhidden_states\ntorch.Tensor\nInput tensor of shape [batch_size, seq_len, 
hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nA tuple (query_states, key_states, value_states) containing the projected states for query, key, and value.\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.original_apply_qkv_optional_v(self, hidden_states)\nQKV projection for models where v_proj may be None (e.g. Gemma4 attention_k_eq_v).\nWhen v_proj is None, key_states are reused as value_states.\n\n\n\nmonkeypatch.lora_kernels.patch_self_attn_lora(cfg)\nGiven an axolotl config, this method patches the inferred attention class forward\npass with optimized LoRA implementations.\nIt modifies the attention class to use optimized QKV and output projections. The\noriginal implementation is preserved and can be restored if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nAssertionError\nIf the required code blocks are not found in the attention implementation."
+    "text": "core.trainers.mixins.rng_state_loader\nTemporary fix/override for bug in resume from checkpoint\nSee https://github.com/huggingface/transformers/pull/37162\nTODO: Remove when upstream added PR to release\n\n\n\n\n\nName\nDescription\n\n\n\n\nRngLoaderMixin\nmixin for method override to load RNG states from a checkpoint\n\n\n\n\n\ncore.trainers.mixins.rng_state_loader.RngLoaderMixin()\nmixin for method override to load RNG states from a checkpoint"
   },
   {
-    "objectID": "docs/api/utils.schemas.model.html",
-    "href": "docs/api/utils.schemas.model.html",
-    "title": "utils.schemas.model",
+    "objectID": "docs/api/core.trainers.mixins.rng_state_loader.html#classes",
+    "href": "docs/api/core.trainers.mixins.rng_state_loader.html#classes",
+    "title": "core.trainers.mixins.rng_state_loader",
     "section": "",
-    "text": "utils.schemas.model\nPydantic models for model input / output, etc. configuration\n\n\n\n\n\nName\nDescription\n\n\n\n\nModelInputConfig\nModel configuration subset\n\n\nModelOutputConfig\nmodel save configuration subset\n\n\nSpecialTokensConfig\nSpecial tokens configuration subset\n\n\n\n\n\nutils.schemas.model.ModelInputConfig()\nModel configuration subset\n\n\n\nutils.schemas.model.ModelOutputConfig()\nmodel save configuration subset\n\n\n\nutils.schemas.model.SpecialTokensConfig()\nSpecial tokens configuration subset"
+    "text": "Name\nDescription\n\n\n\n\nRngLoaderMixin\nmixin for method override to load RNG states from a checkpoint\n\n\n\n\n\ncore.trainers.mixins.rng_state_loader.RngLoaderMixin()\nmixin for method override to load RNG states from a checkpoint"
   },
   {
-    "objectID": "docs/api/utils.schemas.model.html#classes",
-    "href": "docs/api/utils.schemas.model.html#classes",
-    "title": "utils.schemas.model",
+    "objectID": "docs/api/cli.cloud.base.html",
+    "href": "docs/api/cli.cloud.base.html",
+    "title": "cli.cloud.base",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nModelInputConfig\nModel configuration subset\n\n\nModelOutputConfig\nmodel save configuration subset\n\n\nSpecialTokensConfig\nSpecial tokens configuration subset\n\n\n\n\n\nutils.schemas.model.ModelInputConfig()\nModel configuration subset\n\n\n\nutils.schemas.model.ModelOutputConfig()\nmodel save configuration subset\n\n\n\nutils.schemas.model.SpecialTokensConfig()\nSpecial tokens configuration subset"
+    "text": "cli.cloud.base\nbase class for cloud platforms from cli\n\n\n\n\n\nName\nDescription\n\n\n\n\nCloud\nAbstract base class for cloud platforms.\n\n\n\n\n\ncli.cloud.base.Cloud()\nAbstract base class for cloud platforms."
   },
   {
-    "objectID": "docs/api/prompt_strategies.pygmalion.html",
-    "href": "docs/api/prompt_strategies.pygmalion.html",
-    "title": "prompt_strategies.pygmalion",
+    "objectID": "docs/api/cli.cloud.base.html#classes",
+    "href": "docs/api/cli.cloud.base.html#classes",
+    "title": "cli.cloud.base",
     "section": "",
-    "text": "prompt_strategies.pygmalion\nModule containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class\n\n\n\n\n\nName\nDescription\n\n\n\n\nPygmalionPromptTokenizingStrategy\nTokenizing strategy for Pygmalion.\n\n\nPygmalionPrompter\nPrompter for Pygmalion.\n\n\n\n\n\nprompt_strategies.pygmalion.PygmalionPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    *args,\n    **kwargs,\n)\nTokenizing strategy for Pygmalion.\n\n\n\nprompt_strategies.pygmalion.PygmalionPrompter(*args, **kwargs)\nPrompter for Pygmalion."
+    "text": "Name\nDescription\n\n\n\n\nCloud\nAbstract base class for cloud platforms.\n\n\n\n\n\ncli.cloud.base.Cloud()\nAbstract base class for cloud platforms."
   },
   {
-    "objectID": "docs/api/prompt_strategies.pygmalion.html#classes",
-    "href": "docs/api/prompt_strategies.pygmalion.html#classes",
-    "title": "prompt_strategies.pygmalion",
+    "objectID": "docs/api/core.builders.causal.html",
+    "href": "docs/api/core.builders.causal.html",
+    "title": "core.builders.causal",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nPygmalionPromptTokenizingStrategy\nTokenizing strategy for Pygmalion.\n\n\nPygmalionPrompter\nPrompter for Pygmalion.\n\n\n\n\n\nprompt_strategies.pygmalion.PygmalionPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    *args,\n    **kwargs,\n)\nTokenizing strategy for Pygmalion.\n\n\n\nprompt_strategies.pygmalion.PygmalionPrompter(*args, **kwargs)\nPrompter for Pygmalion."
+    "text": "core.builders.causal\nBuilder for causal trainers\n\n\n\n\n\nName\nDescription\n\n\n\n\nHFCausalTrainerBuilder\nBuild the HuggingFace training args/trainer for causal models and reward modeling\n\n\n\n\n\ncore.builders.causal.HFCausalTrainerBuilder(\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nBuild the HuggingFace training args/trainer for causal models and reward modeling\nusing TRL."
   },
   {
-    "objectID": "docs/api/prompt_strategies.bradley_terry.llama3.html",
-    "href": "docs/api/prompt_strategies.bradley_terry.llama3.html",
-    "title": "prompt_strategies.bradley_terry.llama3",
+    "objectID": "docs/api/core.builders.causal.html#classes",
+    "href": "docs/api/core.builders.causal.html#classes",
+    "title": "core.builders.causal",
     "section": "",
-    "text": "prompt_strategies.bradley_terry.llama3\nchatml transforms for datasets with system, input, chosen, rejected to match llama3 chat template\n\n\n\n\n\nName\nDescription\n\n\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\n\n\n\nprompt_strategies.bradley_terry.llama3.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs"
+    "text": "Name\nDescription\n\n\n\n\nHFCausalTrainerBuilder\nBuild the HuggingFace training args/trainer for causal models and reward modeling\n\n\n\n\n\ncore.builders.causal.HFCausalTrainerBuilder(\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nBuild the HuggingFace training args/trainer for causal models and reward modeling\nusing TRL."
   },
   {
-    "objectID": "docs/api/prompt_strategies.bradley_terry.llama3.html#functions",
-    "href": "docs/api/prompt_strategies.bradley_terry.llama3.html#functions",
-    "title": "prompt_strategies.bradley_terry.llama3",
+    "objectID": "docs/api/monkeypatch.btlm_attn_hijack_flash.html",
+    "href": "docs/api/monkeypatch.btlm_attn_hijack_flash.html",
+    "title": "monkeypatch.btlm_attn_hijack_flash",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\n\n\n\nprompt_strategies.bradley_terry.llama3.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs"
+    "text": "monkeypatch.btlm_attn_hijack_flash\nmonkeypatch.btlm_attn_hijack_flash\nFlash attention monkey patch for cerebras btlm model"
   },
   {
-    "objectID": "docs/api/utils.optimizers.adopt.html",
-    "href": "docs/api/utils.optimizers.adopt.html",
-    "title": "utils.optimizers.adopt",
+    "objectID": "docs/api/utils.trainer.html",
+    "href": "docs/api/utils.trainer.html",
+    "title": "utils.trainer",
     "section": "",
-    "text": "utils.optimizers.adopt\nCopied from https://github.com/iShohei220/adopt\nADOPT: Modified Adam Can Converge with Any β2 with the Optimal Rate (2024)\nTaniguchi, Shohei and Harada, Keno and Minegishi, Gouki and Oshima, Yuta and Jeong, Seong Cheol and Nagahara, Go and Iiyama, Tomoshi and Suzuki, Masahiro and Iwasawa, Yusuke and Matsuo, Yutaka\n\n\n\n\n\nName\nDescription\n\n\n\n\nadopt\nFunctional API that performs ADOPT algorithm computation.\n\n\n\n\n\nutils.optimizers.adopt.adopt(\n    params,\n    grads,\n    exp_avgs,\n    exp_avg_sqs,\n    state_steps,\n    foreach=None,\n    capturable=False,\n    differentiable=False,\n    fused=None,\n    grad_scale=None,\n    found_inf=None,\n    has_complex=False,\n    *,\n    beta1,\n    beta2,\n    lr,\n    clip_lambda,\n    weight_decay,\n    decouple,\n    eps,\n    maximize,\n)\nFunctional API that performs ADOPT algorithm computation."
+    "text": "utils.trainer\nModule containing the Trainer class and related functions\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_pose_position_ids\nuse the PoSE technique to extend the context length by randomly skipping\n\n\nadd_position_ids\nHandle both single-example and batched data.\n\n\nfilter_sequences_by_length\nFilter sequences outside valid length range [min_sequence_len, sequence_len].\n\n\nsetup_trainer\nHelper method for instantiating and building a (causal or RLHF) trainer.\n\n\n\n\n\nutils.trainer.add_pose_position_ids(\n    sample,\n    max_context_len=32768,\n    split_on_token_ids=None,\n    chunks=2,\n)\nuse the PoSE technique to extend the context length by randomly skipping\npositions in the context. We only want to skip right before tokens in\nthe split_on_token_ids list. We should attempt to randomly distribute\nthe skips, but we don’t need the final position_ids to be the full\ncontext_len. There may be multiple turns in the context, so we want to\nmake sure we take into account the maximum possible number of skips\nremaining in each sample.\n\n\n\nutils.trainer.add_position_ids(sample)\nHandle both single-example and batched data.\n- single example: sample[‘input_ids’] is a list[int]\n- batched data: sample[‘input_ids’] is a list[list[int]]\n\n\n\nutils.trainer.filter_sequences_by_length(\n    sample,\n    sequence_len=2048,\n    min_sequence_len=2,\n    raise_on_drop=False,\n)\nFilter sequences outside valid length range [min_sequence_len, sequence_len].\nDrops samples that are either too short (&lt; min_sequence_len) or too long (&gt; sequence_len).\nWorks for both single-example (list[int]) or batched (list[list[int]]).\nIf raise_on_drop is set, the code raises a ValueError if a sample is\nencountered that is too long and would have been dropped.\n\n\n\nutils.trainer.setup_trainer(\n    cfg,\n    train_dataset,\n    eval_dataset,\n    model,\n    tokenizer,\n    processor,\n    total_num_steps,\n    model_ref=None,\n    
peft_config=None,\n)\nHelper method for instantiating and building a (causal or RLHF) trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\n\nAxolotl config object containing training parameters.\nrequired\n\n\ntrain_dataset\n\nDataset to use for training.\nrequired\n\n\neval_dataset\n\nDataset to use for evaluation.\nrequired\n\n\nmodel\n\nThe model to train.\nrequired\n\n\ntokenizer\n\nTokenizer for processing text input.\nrequired\n\n\nprocessor\n\nProcessor for data preparation.\nrequired\n\n\ntotal_num_steps\n\nThe total number of training steps.\nrequired\n\n\nmodel_ref\n\nOptional reference model for RLHF training. Default is None.\nNone\n\n\npeft_config\n\nOptional PEFT (Parameter-Efficient Fine-Tuning) configuration. Default is None.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nA trainer instance (either HFRLTrainer or HFCausalTrainer) configured based on the provided parameters."
   },
   {
-    "objectID": "docs/api/utils.optimizers.adopt.html#functions",
-    "href": "docs/api/utils.optimizers.adopt.html#functions",
-    "title": "utils.optimizers.adopt",
+    "objectID": "docs/api/utils.trainer.html#functions",
+    "href": "docs/api/utils.trainer.html#functions",
+    "title": "utils.trainer",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nadopt\nFunctional API that performs ADOPT algorithm computation.\n\n\n\n\n\nutils.optimizers.adopt.adopt(\n    params,\n    grads,\n    exp_avgs,\n    exp_avg_sqs,\n    state_steps,\n    foreach=None,\n    capturable=False,\n    differentiable=False,\n    fused=None,\n    grad_scale=None,\n    found_inf=None,\n    has_complex=False,\n    *,\n    beta1,\n    beta2,\n    lr,\n    clip_lambda,\n    weight_decay,\n    decouple,\n    eps,\n    maximize,\n)\nFunctional API that performs ADOPT algorithm computation."
+    "text": "Name\nDescription\n\n\n\n\nadd_pose_position_ids\nuse the PoSE technique to extend the context length by randomly skipping\n\n\nadd_position_ids\nHandle both single-example and batched data.\n\n\nfilter_sequences_by_length\nFilter sequences outside valid length range [min_sequence_len, sequence_len].\n\n\nsetup_trainer\nHelper method for instantiating and building a (causal or RLHF) trainer.\n\n\n\n\n\nutils.trainer.add_pose_position_ids(\n    sample,\n    max_context_len=32768,\n    split_on_token_ids=None,\n    chunks=2,\n)\nuse the PoSE technique to extend the context length by randomly skipping\npositions in the context. We only want to skip right before tokens in\nthe split_on_token_ids list. We should attempt to randomly distribute\nthe skips, but we don’t need the final position_ids to be the full\ncontext_len. There may be multiple turns in the context, so we want to\nmake sure we take into account the maximum possible number of skips\nremaining in each sample.\n\n\n\nutils.trainer.add_position_ids(sample)\nHandle both single-example and batched data.\n- single example: sample[‘input_ids’] is a list[int]\n- batched data: sample[‘input_ids’] is a list[list[int]]\n\n\n\nutils.trainer.filter_sequences_by_length(\n    sample,\n    sequence_len=2048,\n    min_sequence_len=2,\n    raise_on_drop=False,\n)\nFilter sequences outside valid length range [min_sequence_len, sequence_len].\nDrops samples that are either too short (&lt; min_sequence_len) or too long (&gt; sequence_len).\nWorks for both single-example (list[int]) or batched (list[list[int]]).\nIf raise_on_drop is set, the code raises a ValueError if a sample is\nencountered that is too long and would have been dropped.\n\n\n\nutils.trainer.setup_trainer(\n    cfg,\n    train_dataset,\n    eval_dataset,\n    model,\n    tokenizer,\n    processor,\n    total_num_steps,\n    model_ref=None,\n    peft_config=None,\n)\nHelper method for instantiating and building a (causal or RLHF) 
trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\n\nAxolotl config object containing training parameters.\nrequired\n\n\ntrain_dataset\n\nDataset to use for training.\nrequired\n\n\neval_dataset\n\nDataset to use for evaluation.\nrequired\n\n\nmodel\n\nThe model to train.\nrequired\n\n\ntokenizer\n\nTokenizer for processing text input.\nrequired\n\n\nprocessor\n\nProcessor for data preparation.\nrequired\n\n\ntotal_num_steps\n\nThe total number of training steps.\nrequired\n\n\nmodel_ref\n\nOptional reference model for RLHF training. Default is None.\nNone\n\n\npeft_config\n\nOptional PEFT (Parameter-Efficient Fine-Tuning) configuration. Default is None.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nA trainer instance (either HFRLTrainer or HFCausalTrainer) configured based on the provided parameters."
   },
   {
-    "objectID": "docs/api/utils.bench.html",
-    "href": "docs/api/utils.bench.html",
-    "title": "utils.bench",
+    "objectID": "docs/api/monkeypatch.llama_attn_hijack_xformers.html",
+    "href": "docs/api/monkeypatch.llama_attn_hijack_xformers.html",
+    "title": "monkeypatch.llama_attn_hijack_xformers",
     "section": "",
-    "text": "utils.bench\nBenchmarking and measurement utilities\n\n\n\n\n\nName\nDescription\n\n\n\n\ncheck_cuda_device\nwraps a function and returns the default value instead of running the\n\n\n\n\n\nutils.bench.check_cuda_device(default_value)\nwraps a function and returns the default value instead of running the\nwrapped function if cuda isn’t available or the device is auto\n:param default_value:\n:return:"
+    "text": "monkeypatch.llama_attn_hijack_xformers\nmonkeypatch.llama_attn_hijack_xformers\nDirectly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments"
   },
   {
-    "objectID": "docs/api/utils.bench.html#functions",
-    "href": "docs/api/utils.bench.html#functions",
-    "title": "utils.bench",
+    "objectID": "docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html",
+    "href": "docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html",
+    "title": "monkeypatch.gradient_checkpointing.offload_cpu",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ncheck_cuda_device\nwraps a function and returns the default value instead of running the\n\n\n\n\n\nutils.bench.check_cuda_device(default_value)\nwraps a function and returns the default value instead of running the\nwrapped function if cuda isn’t available or the device is auto\n:param default_value:\n:return:"
+    "text": "monkeypatch.gradient_checkpointing.offload_cpu\nCPU offloaded checkpointing\n\n\n\n\n\nName\nDescription\n\n\n\n\nCPU_Offloaded_Gradient_Checkpointer\nSaves VRAM by smartly offloading to RAM.\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_cpu.CPU_Offloaded_Gradient_Checkpointer(\n)\nSaves VRAM by smartly offloading to RAM.\nTiny hit to performance, since we mask the movement via non blocking calls."
   },
   {
-    "objectID": "docs/api/utils.callbacks.qat.html",
-    "href": "docs/api/utils.callbacks.qat.html",
-    "title": "utils.callbacks.qat",
+    "objectID": "docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html#classes",
+    "href": "docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html#classes",
+    "title": "monkeypatch.gradient_checkpointing.offload_cpu",
     "section": "",
-    "text": "utils.callbacks.qat\nQAT Callback for HF Causal Trainer\n\n\n\n\n\nName\nDescription\n\n\n\n\nQATCallback\nCallback to toggle fake quantization for the model.\n\n\n\n\n\nutils.callbacks.qat.QATCallback(cfg)\nCallback to toggle fake quantization for the model.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ntoggle_fake_quant\nToggle fake quantization for any fake quantized linear or embedding layers in the model.\n\n\n\n\n\nutils.callbacks.qat.toggle_fake_quant(mod, enable)\nToggle fake quantization for any fake quantized linear or embedding layers in the model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmod\nnn.Module\nThe module to toggle fake quantization for.\nrequired\n\n\nenable\nbool\nWhether to enable or disable fake quantization.\nrequired"
+    "text": "Name\nDescription\n\n\n\n\nCPU_Offloaded_Gradient_Checkpointer\nSaves VRAM by smartly offloading to RAM.\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_cpu.CPU_Offloaded_Gradient_Checkpointer(\n)\nSaves VRAM by smartly offloading to RAM.\nTiny hit to performance, since we mask the movement via non blocking calls."
   },
   {
-    "objectID": "docs/api/utils.callbacks.qat.html#classes",
-    "href": "docs/api/utils.callbacks.qat.html#classes",
-    "title": "utils.callbacks.qat",
+    "objectID": "docs/api/prompt_strategies.dpo.llama3.html",
+    "href": "docs/api/prompt_strategies.dpo.llama3.html",
+    "title": "prompt_strategies.dpo.llama3",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nQATCallback\nCallback to toggle fake quantization for the model.\n\n\n\n\n\nutils.callbacks.qat.QATCallback(cfg)\nCallback to toggle fake quantization for the model."
+    "text": "prompt_strategies.dpo.llama3\nDPO strategies for llama-3 chat template\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.llama3.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.llama3.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.llama3.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.llama3.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations"
   },
   {
-    "objectID": "docs/api/utils.callbacks.qat.html#functions",
-    "href": "docs/api/utils.callbacks.qat.html#functions",
-    "title": "utils.callbacks.qat",
+    "objectID": "docs/api/prompt_strategies.dpo.llama3.html#functions",
+    "href": "docs/api/prompt_strategies.dpo.llama3.html#functions",
+    "title": "prompt_strategies.dpo.llama3",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ntoggle_fake_quant\nToggle fake quantization for any fake quantized linear or embedding layers in the model.\n\n\n\n\n\nutils.callbacks.qat.toggle_fake_quant(mod, enable)\nToggle fake quantization for any fake quantized linear or embedding layers in the model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmod\nnn.Module\nThe module to toggle fake quantization for.\nrequired\n\n\nenable\nbool\nWhether to enable or disable fake quantization.\nrequired"
+    "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.llama3.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.llama3.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.llama3.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.llama3.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations"
   },
   {
-    "objectID": "docs/api/core.trainers.base.html",
-    "href": "docs/api/core.trainers.base.html",
-    "title": "core.trainers.base",
+    "objectID": "docs/api/monkeypatch.mixtral.html",
+    "href": "docs/api/monkeypatch.mixtral.html",
+    "title": "monkeypatch.mixtral",
     "section": "",
-    "text": "core.trainers.base\nModule for customized trainers\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlTrainer\nExtend the base Trainer for axolotl helpers\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer(\n    *_args,\n    bench_data_collator=None,\n    eval_data_collator=None,\n    dataset_tags=None,\n    **kwargs,\n)\nExtend the base Trainer for axolotl helpers\n\n\n\n\n\nName\nDescription\n\n\n\n\nlog\nLog logs on the various objects watching training, including stored metrics.\n\n\npush_to_hub\nOverwrite the push_to_hub method in order to force-add the tags when pushing the\n\n\nstore_metrics\nStore metrics with specified reduction type.\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer.log(logs, start_time=None)\nLog logs on the various objects watching training, including stored metrics.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nlogs\ndict[str, float]\nThe values to log.\nrequired\n\n\nstart_time\nfloat | None\nThe start of training.\nNone\n\n\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer.push_to_hub(*args, **kwargs)\nOverwrite the push_to_hub method in order to force-add the tags when pushing the\nmodel on the Hub. Please refer to ~transformers.Trainer.push_to_hub for more details.\n\n\n\ncore.trainers.base.AxolotlTrainer.store_metrics(\n    metrics,\n    train_eval='train',\n    reduction='mean',\n)\nStore metrics with specified reduction type.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmetrics\ndict[str, float] | dict[str, tuple[int | float, str]]\nDictionary of metric names to values, or metric names to (value, reduction_type) tuples.\nrequired\n\n\ntrain_eval\nLiteral['train', 'eval']\nWhether this is for training or evaluation.\n'train'"
+    "text": "monkeypatch.mixtral\nmonkeypatch.mixtral\nPatches to support multipack for mixtral"
   },
   {
-    "objectID": "docs/api/core.trainers.base.html#classes",
-    "href": "docs/api/core.trainers.base.html#classes",
-    "title": "core.trainers.base",
+    "objectID": "docs/api/integrations.cut_cross_entropy.args.html",
+    "href": "docs/api/integrations.cut_cross_entropy.args.html",
+    "title": "integrations.cut_cross_entropy.args",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nAxolotlTrainer\nExtend the base Trainer for axolotl helpers\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer(\n    *_args,\n    bench_data_collator=None,\n    eval_data_collator=None,\n    dataset_tags=None,\n    **kwargs,\n)\nExtend the base Trainer for axolotl helpers\n\n\n\n\n\nName\nDescription\n\n\n\n\nlog\nLog logs on the various objects watching training, including stored metrics.\n\n\npush_to_hub\nOverwrite the push_to_hub method in order to force-add the tags when pushing the\n\n\nstore_metrics\nStore metrics with specified reduction type.\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer.log(logs, start_time=None)\nLog logs on the various objects watching training, including stored metrics.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nlogs\ndict[str, float]\nThe values to log.\nrequired\n\n\nstart_time\nfloat | None\nThe start of training.\nNone\n\n\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer.push_to_hub(*args, **kwargs)\nOverwrite the push_to_hub method in order to force-add the tags when pushing the\nmodel on the Hub. Please refer to ~transformers.Trainer.push_to_hub for more details.\n\n\n\ncore.trainers.base.AxolotlTrainer.store_metrics(\n    metrics,\n    train_eval='train',\n    reduction='mean',\n)\nStore metrics with specified reduction type.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmetrics\ndict[str, float] | dict[str, tuple[int | float, str]]\nDictionary of metric names to values, or metric names to (value, reduction_type) tuples.\nrequired\n\n\ntrain_eval\nLiteral['train', 'eval']\nWhether this is for training or evaluation.\n'train'"
+    "text": "integrations.cut_cross_entropy.args\nModule for handling Cut Cross Entropy input arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nCutCrossEntropyArgs\nInput args for Cut Cross Entropy.\n\n\n\n\n\nintegrations.cut_cross_entropy.args.CutCrossEntropyArgs()\nInput args for Cut Cross Entropy."
   },
   {
-    "objectID": "docs/api/prompt_strategies.dpo.zephyr.html",
-    "href": "docs/api/prompt_strategies.dpo.zephyr.html",
-    "title": "prompt_strategies.dpo.zephyr",
+    "objectID": "docs/api/integrations.cut_cross_entropy.args.html#classes",
+    "href": "docs/api/integrations.cut_cross_entropy.args.html#classes",
+    "title": "integrations.cut_cross_entropy.args",
     "section": "",
-    "text": "prompt_strategies.dpo.zephyr\nprompt_strategies.dpo.zephyr\nDPO strategies for zephyr"
+    "text": "Name\nDescription\n\n\n\n\nCutCrossEntropyArgs\nInput args for Cut Cross Entropy.\n\n\n\n\n\nintegrations.cut_cross_entropy.args.CutCrossEntropyArgs()\nInput args for Cut Cross Entropy."
   },
   {
-    "objectID": "docs/api/core.chat.messages.html",
-    "href": "docs/api/core.chat.messages.html",
-    "title": "core.chat.messages",
+    "objectID": "docs/api/integrations.spectrum.args.html",
+    "href": "docs/api/integrations.spectrum.args.html",
+    "title": "integrations.spectrum.args",
     "section": "",
-    "text": "core.chat.messages\ninternal message representations of chat messages\n\n\n\n\n\nName\nDescription\n\n\n\n\nChatFormattedChats\nChat formatted chats with formatter and optional train on inputs\n\n\nChats\ntop level data structure for chat conversations\n\n\nMessageContentTypes\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\nMessageContents\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\nMessageRoles\nMessage roles for the system, user, assistant, and tools\n\n\nMessages\nMessages with role, content, metadata, weight, and chat formatting\n\n\nPreferenceChats\nrepresentation for preference data for chat\n\n\nSpecialToken\nSpecial tokens for beginning of string and end of string\n\n\nTool\nTool with description, function, and parameters\n\n\nToolCallContents\nTool call contents with name, arguments, and optional id\n\n\nToolCallFunction\nTool call function with name and arguments\n\n\nToolResponseContents\nTool response contents with name, content, and optional id\n\n\n\n\n\ncore.chat.messages.ChatFormattedChats()\nChat formatted chats with formatter and optional train on inputs\n\n\n\ncore.chat.messages.Chats()\ntop level data structure for chat conversations\n\n\n\ncore.chat.messages.MessageContentTypes()\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\n\ncore.chat.messages.MessageContents()\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\n\ncore.chat.messages.MessageRoles()\nMessage roles for the system, user, assistant, and tools\n\n\n\ncore.chat.messages.Messages()\nMessages with role, content, metadata, weight, and chat formatting\n\n\n\ncore.chat.messages.PreferenceChats()\nrepresentation for preference data for chat\n\n\n\ncore.chat.messages.SpecialToken()\nSpecial tokens for beginning of string and end of string\n\n\n\ncore.chat.messages.Tool()\nTool with description, function, and 
parameters\n\n\n\ncore.chat.messages.ToolCallContents()\nTool call contents with name, arguments, and optional id\n\n\n\ncore.chat.messages.ToolCallFunction()\nTool call function with name and arguments\n\n\n\ncore.chat.messages.ToolResponseContents()\nTool response contents with name, content, and optional id"
+    "text": "integrations.spectrum.args\nModule for handling Spectrum input arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nSpectrumArgs\nInput args for Spectrum.\n\n\n\n\n\nintegrations.spectrum.args.SpectrumArgs()\nInput args for Spectrum."
   },
   {
-    "objectID": "docs/api/core.chat.messages.html#classes",
-    "href": "docs/api/core.chat.messages.html#classes",
-    "title": "core.chat.messages",
+    "objectID": "docs/api/integrations.spectrum.args.html#classes",
+    "href": "docs/api/integrations.spectrum.args.html#classes",
+    "title": "integrations.spectrum.args",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nChatFormattedChats\nChat formatted chats with formatter and optional train on inputs\n\n\nChats\ntop level data structure for chat conversations\n\n\nMessageContentTypes\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\nMessageContents\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\nMessageRoles\nMessage roles for the system, user, assistant, and tools\n\n\nMessages\nMessages with role, content, metadata, weight, and chat formatting\n\n\nPreferenceChats\nrepresentation for preference data for chat\n\n\nSpecialToken\nSpecial tokens for beginning of string and end of string\n\n\nTool\nTool with description, function, and parameters\n\n\nToolCallContents\nTool call contents with name, arguments, and optional id\n\n\nToolCallFunction\nTool call function with name and arguments\n\n\nToolResponseContents\nTool response contents with name, content, and optional id\n\n\n\n\n\ncore.chat.messages.ChatFormattedChats()\nChat formatted chats with formatter and optional train on inputs\n\n\n\ncore.chat.messages.Chats()\ntop level data structure for chat conversations\n\n\n\ncore.chat.messages.MessageContentTypes()\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\n\ncore.chat.messages.MessageContents()\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\n\ncore.chat.messages.MessageRoles()\nMessage roles for the system, user, assistant, and tools\n\n\n\ncore.chat.messages.Messages()\nMessages with role, content, metadata, weight, and chat formatting\n\n\n\ncore.chat.messages.PreferenceChats()\nrepresentation for preference data for chat\n\n\n\ncore.chat.messages.SpecialToken()\nSpecial tokens for beginning of string and end of string\n\n\n\ncore.chat.messages.Tool()\nTool with description, function, and parameters\n\n\n\ncore.chat.messages.ToolCallContents()\nTool call contents with name, arguments, 
and optional id\n\n\n\ncore.chat.messages.ToolCallFunction()\nTool call function with name and arguments\n\n\n\ncore.chat.messages.ToolResponseContents()\nTool response contents with name, content, and optional id"
+    "text": "Name\nDescription\n\n\n\n\nSpectrumArgs\nInput args for Spectrum.\n\n\n\n\n\nintegrations.spectrum.args.SpectrumArgs()\nInput args for Spectrum."
   },
   {
-    "objectID": "docs/api/integrations.lm_eval.args.html",
-    "href": "docs/api/integrations.lm_eval.args.html",
-    "title": "integrations.lm_eval.args",
+    "objectID": "docs/api/prompt_strategies.dpo.user_defined.html",
+    "href": "docs/api/prompt_strategies.dpo.user_defined.html",
+    "title": "prompt_strategies.dpo.user_defined",
     "section": "",
-    "text": "integrations.lm_eval.args\nModule for handling lm eval harness input arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nLMEvalArgs\nInput args for lm eval harness\n\n\n\n\n\nintegrations.lm_eval.args.LMEvalArgs()\nInput args for lm eval harness"
+    "text": "prompt_strategies.dpo.user_defined\nprompt_strategies.dpo.user_defined\nUser-defined DPO strategies"
   },
   {
-    "objectID": "docs/api/integrations.lm_eval.args.html#classes",
-    "href": "docs/api/integrations.lm_eval.args.html#classes",
-    "title": "integrations.lm_eval.args",
+    "objectID": "docs/api/utils.data.sft.html",
+    "href": "docs/api/utils.data.sft.html",
+    "title": "utils.data.sft",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nLMEvalArgs\nInput args for lm eval harness\n\n\n\n\n\nintegrations.lm_eval.args.LMEvalArgs()\nInput args for lm eval harness"
+    "text": "utils.data.sft\nData handling specific to SFT.\n\n\n\n\n\nName\nDescription\n\n\n\n\nprepare_datasets\nPrepare training and evaluation datasets based on configuration.\n\n\n\n\n\nutils.data.sft.prepare_datasets(cfg, tokenizer, processor=None)\nPrepare training and evaluation datasets based on configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nTokenizer to use for processing text.\nrequired\n\n\nprocessor\nProcessorMixin | None\nOptional processor for multimodal datasets.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[IterableDataset | Dataset, Dataset | None, int, list[Prompter | None]]\nTuple of (train_dataset, eval_dataset, total_steps, prompters)."
   },
   {
-    "objectID": "docs/api/core.trainers.mamba.html",
-    "href": "docs/api/core.trainers.mamba.html",
-    "title": "core.trainers.mamba",
+    "objectID": "docs/api/utils.data.sft.html#functions",
+    "href": "docs/api/utils.data.sft.html#functions",
+    "title": "utils.data.sft",
     "section": "",
-    "text": "core.trainers.mamba\nModule for mamba trainer\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlMambaTrainer\nMamba specific trainer to handle loss calculation\n\n\n\n\n\ncore.trainers.mamba.AxolotlMambaTrainer(\n    *_args,\n    bench_data_collator=None,\n    eval_data_collator=None,\n    dataset_tags=None,\n    **kwargs,\n)\nMamba specific trainer to handle loss calculation"
+    "text": "Name\nDescription\n\n\n\n\nprepare_datasets\nPrepare training and evaluation datasets based on configuration.\n\n\n\n\n\nutils.data.sft.prepare_datasets(cfg, tokenizer, processor=None)\nPrepare training and evaluation datasets based on configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nTokenizer to use for processing text.\nrequired\n\n\nprocessor\nProcessorMixin | None\nOptional processor for multimodal datasets.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[IterableDataset | Dataset, Dataset | None, int, list[Prompter | None]]\nTuple of (train_dataset, eval_dataset, total_steps, prompters)."
   },
   {
-    "objectID": "docs/api/core.trainers.mamba.html#classes",
-    "href": "docs/api/core.trainers.mamba.html#classes",
-    "title": "core.trainers.mamba",
+    "objectID": "docs/api/monkeypatch.trainer_fsdp_optim.html",
+    "href": "docs/api/monkeypatch.trainer_fsdp_optim.html",
+    "title": "monkeypatch.trainer_fsdp_optim",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nAxolotlMambaTrainer\nMamba specific trainer to handle loss calculation\n\n\n\n\n\ncore.trainers.mamba.AxolotlMambaTrainer(\n    *_args,\n    bench_data_collator=None,\n    eval_data_collator=None,\n    dataset_tags=None,\n    **kwargs,\n)\nMamba specific trainer to handle loss calculation"
+    "text": "monkeypatch.trainer_fsdp_optim\nfix for FSDP optimizer save in trainer w 4.47.0\n\n\n\n\n\nName\nDescription\n\n\n\n\npatch_training_loop_for_fsdp\nmonkeypatch for fixing the training loop for fsdp with optimizer save\n\n\n\n\n\nmonkeypatch.trainer_fsdp_optim.patch_training_loop_for_fsdp()\nmonkeypatch for fixing the training loop for fsdp with optimizer save"
   },
   {
-    "objectID": "docs/api/train.html",
-    "href": "docs/api/train.html",
-    "title": "train",
+    "objectID": "docs/api/monkeypatch.trainer_fsdp_optim.html#functions",
+    "href": "docs/api/monkeypatch.trainer_fsdp_optim.html#functions",
+    "title": "monkeypatch.trainer_fsdp_optim",
     "section": "",
-    "text": "train\nPrepare and train a model on a dataset. Can also infer from a model or merge lora\n\n\n\n\n\nName\nDescription\n\n\n\n\ncreate_model_card\nCreate a model card for the trained model if needed.\n\n\nexecute_training\nExecute the training process with appropriate SDP kernel configurations.\n\n\nhandle_untrained_tokens_fix\nApply fixes for untrained tokens if configured.\n\n\nsave_initial_configs\nSave initial configurations before training.\n\n\nsave_trained_model\nSave the trained model according to configuration and training setup.\n\n\nsetup_model_and_tokenizer\nLoad the tokenizer, processor (for multimodal models), and model based on\n\n\nsetup_model_and_trainer\nLoad model, tokenizer, trainer, etc. Helper function to encapsulate the full\n\n\nsetup_model_card\nSet up the Axolotl badge and add the Axolotl config to the model card if available.\n\n\nsetup_reference_model\nSet up the reference model for RL training if needed.\n\n\nsetup_signal_handler\nSet up signal handler for graceful termination.\n\n\ntrain\nTrain a model on the given dataset.\n\n\n\n\n\ntrain.create_model_card(cfg, trainer)\nCreate a model card for the trained model if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object with model card creation capabilities.\nrequired\n\n\n\n\n\n\n\ntrain.execute_training(cfg, trainer, resume_from_checkpoint)\nExecute the training process with appropriate SDP kernel configurations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nAny\nThe configured trainer object.\nrequired\n\n\nresume_from_checkpoint\nstr | None\nPath to checkpoint to resume from, if applicable.\nrequired\n\n\n\n\n\n\n\ntrain.handle_untrained_tokens_fix(cfg, model, tokenizer, train_dataset)\nApply fixes for untrained tokens if 
configured.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to apply fixes to.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer for token identification.\nrequired\n\n\ntrain_dataset\nDataset\nThe training dataset to use.\nrequired\n\n\n\n\n\n\n\ntrain.save_initial_configs(cfg, tokenizer, model, peft_config, processor)\nSave initial configurations before training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer to save.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to save configuration for.\nrequired\n\n\npeft_config\nPeftConfig | None\nThe PEFT configuration to save if applicable.\nrequired\n\n\n\n\n\n\n\ntrain.save_trained_model(cfg, trainer, model)\nSave the trained model according to configuration and training setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nAny\nThe trainer object.\nrequired\n\n\nmodel\nPreTrainedModel\nThe trained model to save.\nrequired\n\n\n\n\n\n\n\ntrain.setup_model_and_tokenizer(cfg)\nLoad the tokenizer, processor (for multimodal models), and model based on\nconfiguration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None]\nTuple containing model, tokenizer, peft_config (if LoRA / QLoRA, else None), and processor (if multimodal, else None).\n\n\n\n\n\n\n\ntrain.setup_model_and_trainer(cfg, dataset_meta)\nLoad model, tokenizer, trainer, etc. 
Helper function to encapsulate the full\ntrainer setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration dictionary with training parameters.\nrequired\n\n\ndataset_meta\nTrainDatasetMeta\nObject with training, validation datasets and metadata.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple['HFRLTrainerBuilder' | 'HFCausalTrainerBuilder', PeftModel | PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None]\nTuple of: - Trainer (Causal or RLHF) - Model - Tokenizer - PEFT config - Processor\n\n\n\n\n\n\n\ntrain.setup_model_card(cfg)\nSet up the Axolotl badge and add the Axolotl config to the model card if available.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\ntrain.setup_reference_model(cfg, tokenizer)\nSet up the reference model for RL training if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer to use for the reference model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nPreTrainedModel | None\nReference model if needed for RL training, None otherwise.\n\n\n\n\n\n\n\ntrain.setup_signal_handler(cfg, model)\nSet up signal handler for graceful termination.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to save on termination\nrequired\n\n\n\n\n\n\n\ntrain.train(cfg, dataset_meta)\nTrain a model on the given dataset.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration dictionary with training parameters\nrequired\n\n\ndataset_meta\nTrainDatasetMeta\nObject with training, validation datasets 
and metadata\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PeftModel | PreTrainedModel, PreTrainedTokenizer, Trainer]\nTuple of (model, tokenizer) after training"
+    "text": "Name\nDescription\n\n\n\n\npatch_training_loop_for_fsdp\nmonkeypatch for fixing the training loop for fsdp with optimizer save\n\n\n\n\n\nmonkeypatch.trainer_fsdp_optim.patch_training_loop_for_fsdp()\nmonkeypatch for fixing the training loop for fsdp with optimizer save"
   },
   {
-    "objectID": "docs/api/train.html#functions",
-    "href": "docs/api/train.html#functions",
-    "title": "train",
+    "objectID": "docs/api/utils.tokenization.html",
+    "href": "docs/api/utils.tokenization.html",
+    "title": "utils.tokenization",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ncreate_model_card\nCreate a model card for the trained model if needed.\n\n\nexecute_training\nExecute the training process with appropriate SDP kernel configurations.\n\n\nhandle_untrained_tokens_fix\nApply fixes for untrained tokens if configured.\n\n\nsave_initial_configs\nSave initial configurations before training.\n\n\nsave_trained_model\nSave the trained model according to configuration and training setup.\n\n\nsetup_model_and_tokenizer\nLoad the tokenizer, processor (for multimodal models), and model based on\n\n\nsetup_model_and_trainer\nLoad model, tokenizer, trainer, etc. Helper function to encapsulate the full\n\n\nsetup_model_card\nSet up the Axolotl badge and add the Axolotl config to the model card if available.\n\n\nsetup_reference_model\nSet up the reference model for RL training if needed.\n\n\nsetup_signal_handler\nSet up signal handler for graceful termination.\n\n\ntrain\nTrain a model on the given dataset.\n\n\n\n\n\ntrain.create_model_card(cfg, trainer)\nCreate a model card for the trained model if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object with model card creation capabilities.\nrequired\n\n\n\n\n\n\n\ntrain.execute_training(cfg, trainer, resume_from_checkpoint)\nExecute the training process with appropriate SDP kernel configurations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nAny\nThe configured trainer object.\nrequired\n\n\nresume_from_checkpoint\nstr | None\nPath to checkpoint to resume from, if applicable.\nrequired\n\n\n\n\n\n\n\ntrain.handle_untrained_tokens_fix(cfg, model, tokenizer, train_dataset)\nApply fixes for untrained tokens if 
configured.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to apply fixes to.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer for token identification.\nrequired\n\n\ntrain_dataset\nDataset\nThe training dataset to use.\nrequired\n\n\n\n\n\n\n\ntrain.save_initial_configs(cfg, tokenizer, model, peft_config, processor)\nSave initial configurations before training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer to save.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to save configuration for.\nrequired\n\n\npeft_config\nPeftConfig | None\nThe PEFT configuration to save if applicable.\nrequired\n\n\n\n\n\n\n\ntrain.save_trained_model(cfg, trainer, model)\nSave the trained model according to configuration and training setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nAny\nThe trainer object.\nrequired\n\n\nmodel\nPreTrainedModel\nThe trained model to save.\nrequired\n\n\n\n\n\n\n\ntrain.setup_model_and_tokenizer(cfg)\nLoad the tokenizer, processor (for multimodal models), and model based on\nconfiguration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None]\nTuple containing model, tokenizer, peft_config (if LoRA / QLoRA, else None), and processor (if multimodal, else None).\n\n\n\n\n\n\n\ntrain.setup_model_and_trainer(cfg, dataset_meta)\nLoad model, tokenizer, trainer, etc. 
Helper function to encapsulate the full\ntrainer setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration dictionary with training parameters.\nrequired\n\n\ndataset_meta\nTrainDatasetMeta\nObject with training, validation datasets and metadata.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple['HFRLTrainerBuilder' | 'HFCausalTrainerBuilder', PeftModel | PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None]\nTuple of: - Trainer (Causal or RLHF) - Model - Tokenizer - PEFT config - Processor\n\n\n\n\n\n\n\ntrain.setup_model_card(cfg)\nSet up the Axolotl badge and add the Axolotl config to the model card if available.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\ntrain.setup_reference_model(cfg, tokenizer)\nSet up the reference model for RL training if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer to use for the reference model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nPreTrainedModel | None\nReference model if needed for RL training, None otherwise.\n\n\n\n\n\n\n\ntrain.setup_signal_handler(cfg, model)\nSet up signal handler for graceful termination.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to save on termination\nrequired\n\n\n\n\n\n\n\ntrain.train(cfg, dataset_meta)\nTrain a model on the given dataset.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration dictionary with training parameters\nrequired\n\n\ndataset_meta\nTrainDatasetMeta\nObject with training, validation datasets 
and metadata\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PeftModel | PreTrainedModel, PreTrainedTokenizer, Trainer]\nTuple of (model, tokenizer) after training"
+    "text": "utils.tokenization\nModule for tokenization utilities\n\n\n\n\n\nName\nDescription\n\n\n\n\ncolor_token_for_rl_debug\nHelper function to color tokens based on their type.\n\n\nprocess_tokens_for_rl_debug\nHelper function to process and color tokens.\n\n\n\n\n\nutils.tokenization.color_token_for_rl_debug(\n    decoded_token,\n    encoded_token,\n    color,\n    text_only,\n)\nHelper function to color tokens based on their type.\n\n\n\nutils.tokenization.process_tokens_for_rl_debug(\n    tokens,\n    color,\n    tokenizer,\n    text_only,\n)\nHelper function to process and color tokens."
   },
   {
-    "objectID": "docs/api/prompt_strategies.orcamini.html",
-    "href": "docs/api/prompt_strategies.orcamini.html",
-    "title": "prompt_strategies.orcamini",
+    "objectID": "docs/api/utils.tokenization.html#functions",
+    "href": "docs/api/utils.tokenization.html#functions",
+    "title": "utils.tokenization",
     "section": "",
-    "text": "prompt_strategies.orcamini\nPrompt Strategy for finetuning Orca Mini (v2) models\nsee also https://huggingface.co/psmathur/orca_mini_v2_7b for more information\nUse dataset type: orcamini in conig.yml to use this prompt style.\nCompared to the alpaca_w_system.open_orca dataset type,\nthis one specifies the system prompt with “### System:”.\nNot suited/tested for multiple-turn conversations without further adjustments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nOrcaMiniPrompter\nAdjusted Prompter for Orca Mini (v2) datasets\n\n\n\n\n\nprompt_strategies.orcamini.OrcaMiniPrompter(\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAdjusted Prompter for Orca Mini (v2) datasets"
+    "text": "Name\nDescription\n\n\n\n\ncolor_token_for_rl_debug\nHelper function to color tokens based on their type.\n\n\nprocess_tokens_for_rl_debug\nHelper function to process and color tokens.\n\n\n\n\n\nutils.tokenization.color_token_for_rl_debug(\n    decoded_token,\n    encoded_token,\n    color,\n    text_only,\n)\nHelper function to color tokens based on their type.\n\n\n\nutils.tokenization.process_tokens_for_rl_debug(\n    tokens,\n    color,\n    tokenizer,\n    text_only,\n)\nHelper function to process and color tokens."
   },
   {
-    "objectID": "docs/api/prompt_strategies.orcamini.html#classes",
-    "href": "docs/api/prompt_strategies.orcamini.html#classes",
-    "title": "prompt_strategies.orcamini",
+    "objectID": "docs/api/loaders.processor.html",
+    "href": "docs/api/loaders.processor.html",
+    "title": "loaders.processor",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nOrcaMiniPrompter\nAdjusted Prompter for Orca Mini (v2) datasets\n\n\n\n\n\nprompt_strategies.orcamini.OrcaMiniPrompter(\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAdjusted Prompter for Orca Mini (v2) datasets"
+    "text": "loaders.processor\nloaders.processor\nProcessor loading functionality for multi-modal models"
   },
   {
-    "objectID": "docs/api/cli.inference.html",
-    "href": "docs/api/cli.inference.html",
-    "title": "cli.inference",
+    "objectID": "docs/api/prompt_strategies.dpo.chat_template.html",
+    "href": "docs/api/prompt_strategies.dpo.chat_template.html",
+    "title": "prompt_strategies.dpo.chat_template",
     "section": "",
-    "text": "cli.inference\nCLI to run inference on a trained model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_inference or do_inference_gradio.\n\n\ndo_inference\nRuns inference on the command line in a loop. User input is accepted, a chat\n\n\ndo_inference_gradio\nRuns inference in a Gradio interface. User input is accepted, a chat template is\n\n\nget_multi_line_input\nGets multi-line input from terminal.\n\n\n\n\n\ncli.inference.do_cli(config=Path('examples/'), gradio=False, **kwargs)\nParses axolotl config, CLI args, and calls do_inference or do_inference_gradio.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.inference.do_inference(cfg, cli_args)\nRuns inference on the command line in a loop. User input is accepted, a chat\ntemplate is (optionally) applied, and the model specified in the axolotl config is\nused to generate completions according to a default generation config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nInferenceCliArgs\nInference-specific CLI arguments.\nrequired\n\n\n\n\n\n\n\ncli.inference.do_inference_gradio(cfg, cli_args)\nRuns inference in a Gradio interface. 
User input is accepted, a chat template is\n(optionally) applied, and the model specified in the axolotl config is used to\ngenerate completions according to a default generation config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nInferenceCliArgs\nInference-specific CLI arguments.\nrequired\n\n\n\n\n\n\n\ncli.inference.get_multi_line_input()\nGets multi-line input from terminal.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPossibly multi-line, possibly empty stdin input as a string."
+    "text": "prompt_strategies.dpo.chat_template\nDPO prompt strategies for using tokenizer chat templates.\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nDPO chat template strategy for argilla-style datasets.\n\n\n\n\n\nprompt_strategies.dpo.chat_template.argilla_chat(cfg, dataset_idx=0, **kwargs)\nDPO chat template strategy for argilla-style datasets.\nFor argilla-style datasets where chosen/rejected contain full conversations\ninstead of single response messages. Extracts the conversation history from\nthe chosen field and formats both chosen/rejected responses using the\nconfigured chat template.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\n\nConfiguration object containing chat_template and dataset settings\nrequired\n\n\ndataset_idx\n\nIndex of the dataset in the config (default: 0)\n0\n\n\n**kwargs\n\nAdditional keyword arguments (unused)\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ntuple\n\n(transform_fn, dataset_kwargs) where: - transform_fn: Function to transform dataset samples - dataset_kwargs: Dict with ‘remove_columns’ specifying columns to drop\n\n\n\n\n\n\n{\n“chosen”: [\n{“role”: “user”, “content”: “…”},\n{“role”: “assistant”, “content”: “…”}\n],\n“rejected”: [\n{“role”: “user”, “content”: “…”},\n{“role”: “assistant”, “content”: “…”}\n]\n}"
   },
   {
-    "objectID": "docs/api/cli.inference.html#functions",
-    "href": "docs/api/cli.inference.html#functions",
-    "title": "cli.inference",
+    "objectID": "docs/api/prompt_strategies.dpo.chat_template.html#functions",
+    "href": "docs/api/prompt_strategies.dpo.chat_template.html#functions",
+    "title": "prompt_strategies.dpo.chat_template",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_inference or do_inference_gradio.\n\n\ndo_inference\nRuns inference on the command line in a loop. User input is accepted, a chat\n\n\ndo_inference_gradio\nRuns inference in a Gradio interface. User input is accepted, a chat template is\n\n\nget_multi_line_input\nGets multi-line input from terminal.\n\n\n\n\n\ncli.inference.do_cli(config=Path('examples/'), gradio=False, **kwargs)\nParses axolotl config, CLI args, and calls do_inference or do_inference_gradio.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.inference.do_inference(cfg, cli_args)\nRuns inference on the command line in a loop. User input is accepted, a chat\ntemplate is (optionally) applied, and the model specified in the axolotl config is\nused to generate completions according to a default generation config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nInferenceCliArgs\nInference-specific CLI arguments.\nrequired\n\n\n\n\n\n\n\ncli.inference.do_inference_gradio(cfg, cli_args)\nRuns inference in a Gradio interface. 
User input is accepted, a chat template is\n(optionally) applied, and the model specified in the axolotl config is used to\ngenerate completions according to a default generation config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nInferenceCliArgs\nInference-specific CLI arguments.\nrequired\n\n\n\n\n\n\n\ncli.inference.get_multi_line_input()\nGets multi-line input from terminal.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPossibly multi-line, possibly empty stdin input as a string."
+    "text": "Name\nDescription\n\n\n\n\nargilla_chat\nDPO chat template strategy for argilla-style datasets.\n\n\n\n\n\nprompt_strategies.dpo.chat_template.argilla_chat(cfg, dataset_idx=0, **kwargs)\nDPO chat template strategy for argilla-style datasets.\nFor argilla-style datasets where chosen/rejected contain full conversations\ninstead of single response messages. Extracts the conversation history from\nthe chosen field and formats both chosen/rejected responses using the\nconfigured chat template.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\n\nConfiguration object containing chat_template and dataset settings\nrequired\n\n\ndataset_idx\n\nIndex of the dataset in the config (default: 0)\n0\n\n\n**kwargs\n\nAdditional keyword arguments (unused)\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ntuple\n\n(transform_fn, dataset_kwargs) where: - transform_fn: Function to transform dataset samples - dataset_kwargs: Dict with ‘remove_columns’ specifying columns to drop\n\n\n\n\n\n\n{\n“chosen”: [\n{“role”: “user”, “content”: “…”},\n{“role”: “assistant”, “content”: “…”}\n],\n“rejected”: [\n{“role”: “user”, “content”: “…”},\n{“role”: “assistant”, “content”: “…”}\n]\n}"
   },
   {
-    "objectID": "docs/api/prompt_strategies.input_output.html",
-    "href": "docs/api/prompt_strategies.input_output.html",
-    "title": "prompt_strategies.input_output",
+    "objectID": "docs/api/core.datasets.transforms.chat_builder.html",
+    "href": "docs/api/core.datasets.transforms.chat_builder.html",
+    "title": "core.datasets.transforms.chat_builder",
     "section": "",
-    "text": "prompt_strategies.input_output\nModule for plain input/output prompt pairs\n\n\n\n\n\nName\nDescription\n\n\n\n\nRawInputOutputPrompter\nprompter for raw i/o data\n\n\nRawInputOutputStrategy\nPrompt Strategy class for input/output pairs\n\n\n\n\n\nprompt_strategies.input_output.RawInputOutputPrompter()\nprompter for raw i/o data\n\n\n\nprompt_strategies.input_output.RawInputOutputStrategy(\n    *args,\n    eos_token=None,\n    **kwargs,\n)\nPrompt Strategy class for input/output pairs"
+    "text": "core.datasets.transforms.chat_builder\nThis module contains a function that builds a transform that takes a row from the\ndataset and converts it to a Chat.\n\n\n\n\n\nName\nDescription\n\n\n\n\nchat_message_transform_builder\nBuilds a transform that takes a row from the dataset and converts it to a Chat\n\n\n\n\n\ncore.datasets.transforms.chat_builder.chat_message_transform_builder(\n    train_on_inputs=False,\n    conversations_field='messages',\n    message_field_role=None,\n    message_field_content=None,\n    message_field_training=None,\n)\nBuilds a transform that takes a row from the dataset and converts it to a Chat\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntrain_on_inputs\nbool\nIf True, the transform will train on the inputs. If False, the transform will train on the targets. Defaults to False.\nFalse\n\n\nconversations_field\nstr\nThe field name of the conversations. Defaults to “messages”.\n'messages'\n\n\nmessage_field_role\nstr | list[str]\nThe field name of the role.\nNone\n\n\nmessage_field_content\nstr | list[str]\nThe field name of the message content.\nNone\n\n\nmessage_field_training\nstr | list[str]\nThe field name of the train/weight.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nCallable\n\nA function that takes a list of conversations and returns a list of messages."
   },
   {
-    "objectID": "docs/api/prompt_strategies.input_output.html#classes",
-    "href": "docs/api/prompt_strategies.input_output.html#classes",
-    "title": "prompt_strategies.input_output",
+    "objectID": "docs/api/core.datasets.transforms.chat_builder.html#functions",
+    "href": "docs/api/core.datasets.transforms.chat_builder.html#functions",
+    "title": "core.datasets.transforms.chat_builder",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nRawInputOutputPrompter\nprompter for raw i/o data\n\n\nRawInputOutputStrategy\nPrompt Strategy class for input/output pairs\n\n\n\n\n\nprompt_strategies.input_output.RawInputOutputPrompter()\nprompter for raw i/o data\n\n\n\nprompt_strategies.input_output.RawInputOutputStrategy(\n    *args,\n    eos_token=None,\n    **kwargs,\n)\nPrompt Strategy class for input/output pairs"
+    "text": "Name\nDescription\n\n\n\n\nchat_message_transform_builder\nBuilds a transform that takes a row from the dataset and converts it to a Chat\n\n\n\n\n\ncore.datasets.transforms.chat_builder.chat_message_transform_builder(\n    train_on_inputs=False,\n    conversations_field='messages',\n    message_field_role=None,\n    message_field_content=None,\n    message_field_training=None,\n)\nBuilds a transform that takes a row from the dataset and converts it to a Chat\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntrain_on_inputs\nbool\nIf True, the transform will train on the inputs. If False, the transform will train on the targets. Defaults to False.\nFalse\n\n\nconversations_field\nstr\nThe field name of the conversations. Defaults to “messages”.\n'messages'\n\n\nmessage_field_role\nstr | list[str]\nThe field name of the role.\nNone\n\n\nmessage_field_content\nstr | list[str]\nThe field name of the message content.\nNone\n\n\nmessage_field_training\nstr | list[str]\nThe field name of the train/weight.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nCallable\n\nA function that takes a list of conversations and returns a list of messages."
   },
   {
-    "objectID": "docs/api/utils.data.streaming.html",
-    "href": "docs/api/utils.data.streaming.html",
-    "title": "utils.data.streaming",
+    "objectID": "docs/api/loaders.tokenizer.html",
+    "href": "docs/api/loaders.tokenizer.html",
+    "title": "loaders.tokenizer",
     "section": "",
-    "text": "utils.data.streaming\nutils.data.streaming\nData handling specific to streaming datasets."
+    "text": "loaders.tokenizer\nTokenizer loading functionality and associated utils\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_tokenizer\nLoad and configure the tokenizer based on the provided config.\n\n\nmodify_tokenizer_files\nModify tokenizer files to replace added_tokens strings, save to output directory,\n\n\n\n\n\nloaders.tokenizer.load_tokenizer(cfg)\nLoad and configure the tokenizer based on the provided config.\n\n\n\nloaders.tokenizer.modify_tokenizer_files(\n    tokenizer_path,\n    token_mappings,\n    output_dir,\n    revision='main',\n)\nModify tokenizer files to replace added_tokens strings, save to output directory,\nand return the path to the modified tokenizer.\nThis only works with reserved tokens that were added to the tokenizer, not tokens\nalready part of the vocab.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntokenizer_path\nstr\nPath or name of the original tokenizer\nrequired\n\n\ntoken_mappings\ndict[int, str]\nDict mapping {token_id (int): new_token_string}\nrequired\n\n\noutput_dir\nstr\nDirectory to save the modified tokenizer\nrequired\n\n\nrevision\nstr\nModel revision/branch/tag/commit to load from (HF Hub)\n'main'\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to the modified tokenizer directory\n\n\n\nRef: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941"
   },
   {
-    "objectID": "docs/api/cli.cloud.modal_.html",
-    "href": "docs/api/cli.cloud.modal_.html",
-    "title": "cli.cloud.modal_",
+    "objectID": "docs/api/loaders.tokenizer.html#functions",
+    "href": "docs/api/loaders.tokenizer.html#functions",
+    "title": "loaders.tokenizer",
     "section": "",
-    "text": "cli.cloud.modal_\nModal Cloud support from CLI\n\n\n\n\n\nName\nDescription\n\n\n\n\nModalCloud\nModal Cloud implementation.\n\n\n\n\n\ncli.cloud.modal_.ModalCloud(config, app=None)\nModal Cloud implementation.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nrun_cmd\nRun a command inside a folder, with Modal Volume reloading before and commit on success.\n\n\n\n\n\ncli.cloud.modal_.run_cmd(cmd, run_folder, volumes=None)\nRun a command inside a folder, with Modal Volume reloading before and commit on success."
+    "text": "Name\nDescription\n\n\n\n\nload_tokenizer\nLoad and configure the tokenizer based on the provided config.\n\n\nmodify_tokenizer_files\nModify tokenizer files to replace added_tokens strings, save to output directory,\n\n\n\n\n\nloaders.tokenizer.load_tokenizer(cfg)\nLoad and configure the tokenizer based on the provided config.\n\n\n\nloaders.tokenizer.modify_tokenizer_files(\n    tokenizer_path,\n    token_mappings,\n    output_dir,\n    revision='main',\n)\nModify tokenizer files to replace added_tokens strings, save to output directory,\nand return the path to the modified tokenizer.\nThis only works with reserved tokens that were added to the tokenizer, not tokens\nalready part of the vocab.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntokenizer_path\nstr\nPath or name of the original tokenizer\nrequired\n\n\ntoken_mappings\ndict[int, str]\nDict mapping {token_id (int): new_token_string}\nrequired\n\n\noutput_dir\nstr\nDirectory to save the modified tokenizer\nrequired\n\n\nrevision\nstr\nModel revision/branch/tag/commit to load from (HF Hub)\n'main'\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to the modified tokenizer directory\n\n\n\nRef: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941"
   },
   {
-    "objectID": "docs/api/cli.cloud.modal_.html#classes",
-    "href": "docs/api/cli.cloud.modal_.html#classes",
-    "title": "cli.cloud.modal_",
+    "objectID": "docs/api/utils.callbacks.perplexity.html",
+    "href": "docs/api/utils.callbacks.perplexity.html",
+    "title": "utils.callbacks.perplexity",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nModalCloud\nModal Cloud implementation.\n\n\n\n\n\ncli.cloud.modal_.ModalCloud(config, app=None)\nModal Cloud implementation."
+    "text": "utils.callbacks.perplexity\ncallback to calculate perplexity as an evaluation metric.\n\n\n\n\n\nName\nDescription\n\n\n\n\nPerplexity\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity(tokenizer, max_seq_len, stride=512)\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\nThis is a custom variant that doesn’t re-tokenize the input or re-load the model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute\nCompute perplexity in a fixed length sliding window across the sequence.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity.compute(model, references=None)\nCompute perplexity in a fixed length sliding window across the sequence."
   },
   {
-    "objectID": "docs/api/cli.cloud.modal_.html#functions",
-    "href": "docs/api/cli.cloud.modal_.html#functions",
-    "title": "cli.cloud.modal_",
+    "objectID": "docs/api/utils.callbacks.perplexity.html#classes",
+    "href": "docs/api/utils.callbacks.perplexity.html#classes",
+    "title": "utils.callbacks.perplexity",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nrun_cmd\nRun a command inside a folder, with Modal Volume reloading before and commit on success.\n\n\n\n\n\ncli.cloud.modal_.run_cmd(cmd, run_folder, volumes=None)\nRun a command inside a folder, with Modal Volume reloading before and commit on success."
+    "text": "Name\nDescription\n\n\n\n\nPerplexity\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity(tokenizer, max_seq_len, stride=512)\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\nThis is a custom variant that doesn’t re-tokenize the input or re-load the model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute\nCompute perplexity in a fixed length sliding window across the sequence.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity.compute(model, references=None)\nCompute perplexity in a fixed length sliding window across the sequence."
   },
   {
-    "objectID": "docs/api/utils.collators.mm_chat.html",
-    "href": "docs/api/utils.collators.mm_chat.html",
-    "title": "utils.collators.mm_chat",
+    "objectID": "docs/api/integrations.base.html",
+    "href": "docs/api/integrations.base.html",
+    "title": "integrations.base",
     "section": "",
-    "text": "utils.collators.mm_chat\nCollators for multi-modal chat messages and packing\n\n\n\n\n\nName\nDescription\n\n\n\n\nMultiModalChatDataCollator\nCollator for multi-modal chat messages\n\n\n\n\n\nutils.collators.mm_chat.MultiModalChatDataCollator(\n    tokenizer,\n    processing_strategy,\n    packing=False,\n    return_tensors='pt',\n    padding=True,\n    pad_to_multiple_of=None,\n)\nCollator for multi-modal chat messages"
+    "text": "integrations.base\nBase class for all plugins.\nA plugin is a reusable, modular, and self-contained piece of code that extends the functionality of Axolotl.\nPlugins can be used to integrate third-party models, modify the training process, or add new features.\nTo create a new plugin, you need to inherit from the BasePlugin class and implement the required methods.\n\n\n\n\n\nName\nDescription\n\n\n\n\nBaseOptimizerFactory\nBase class for factories to create custom optimizers\n\n\nBasePlugin\nBase class for all plugins. Defines the interface for plugin methods.\n\n\nPluginManager\nThe PluginManager class is responsible for loading and managing plugins. It\n\n\n\n\n\nintegrations.base.BaseOptimizerFactory()\nBase class for factories to create custom optimizers\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_decay_parameter_names\nGet all parameter names that weight decay will be applied to.\n\n\n\n\n\nintegrations.base.BaseOptimizerFactory.get_decay_parameter_names(model)\nGet all parameter names that weight decay will be applied to.\nThis function filters out parameters in two ways:\n1. By layer type (instances of layers specified in ALL_LAYERNORM_LAYERS)\n2. By parameter name patterns (containing ‘bias’, or variation of ‘norm’)\n\n\n\n\n\nintegrations.base.BasePlugin()\nBase class for all plugins. Defines the interface for plugin methods.\nA plugin is a reusable, modular, and self-contained piece of code that extends\nthe functionality of Axolotl. 
Plugins can be used to integrate third-party models,\nmodify the training process, or add new features.\nTo create a new plugin, you need to inherit from the BasePlugin class and\nimplement the required methods.\n\n\nPlugin methods include:\n- register(cfg): Registers the plugin with the given configuration.\n- load_datasets(cfg): Loads and preprocesses the dataset for training.\n- pre_model_load(cfg): Performs actions before the model is loaded.\n- post_model_build(cfg, model): Performs actions after the model is loaded, but\nbefore LoRA adapters are applied.\n- pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.\n- post_lora_load(cfg, model): Performs actions after LoRA weights are loaded.\n- post_model_load(cfg, model): Performs actions after the model is loaded,\ninclusive of any adapters.\n- post_trainer_create(cfg, trainer): Performs actions after the trainer is\ncreated.\n- create_optimizer(cfg, trainer): Creates and returns an optimizer for training.\n- create_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and\nreturns a learning rate scheduler.\n- add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before\ntraining.\n- add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after\ntraining.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nAdds callbacks to the trainer after creating the trainer. 
This is useful for\n\n\nadd_callbacks_pre_trainer\nSet up callbacks before creating the trainer.\n\n\ncreate_lr_scheduler\nCreates and returns a learning rate scheduler.\n\n\ncreate_optimizer\nCreates and returns an optimizer for training.\n\n\nget_collator_cls_and_kwargs\nReturns a custom class for the collator.\n\n\nget_input_args\nReturns a pydantic model for the plugin’s input arguments.\n\n\nget_trainer_cls\nReturns a custom class for the trainer.\n\n\nget_training_args\nReturns custom training arguments to set on TrainingArgs.\n\n\nget_training_args_mixin\nReturns a dataclass model for the plugin’s training arguments.\n\n\nload_datasets\nLoads and preprocesses the dataset for training.\n\n\non_rollouts_scored\nCalled after rollouts are scored during online RL (GRPO/PPO).\n\n\npost_lora_load\nPerforms actions after LoRA weights are loaded.\n\n\npost_model_build\nPerforms actions after the model is built/loaded, but before any adapters are applied.\n\n\npost_model_load\nPerforms actions after the model is loaded.\n\n\npost_train\nPerforms actions after training is complete.\n\n\npost_train_unload\nPerforms actions after training is complete and the model is unloaded.\n\n\npost_trainer_create\nPerforms actions after the trainer is created.\n\n\npre_lora_load\nPerforms actions before LoRA weights are loaded.\n\n\npre_model_load\nPerforms actions before the model is loaded.\n\n\nregister\nRegisters the plugin with the given configuration as an unparsed dict.\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_post_trainer(cfg, trainer)\nAdds callbacks to the trainer after creating the trainer. 
This is useful for\ncallbacks that require access to the model or trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[Callable]\nA list of callback functions to be added\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_pre_trainer(cfg, model)\nSet up callbacks before creating the trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[Callable]\nA list of callback functions to be added to the TrainingArgs.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.create_lr_scheduler(\n    cfg,\n    trainer,\n    optimizer,\n    num_training_steps,\n)\nCreates and returns a learning rate scheduler.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\noptimizer\nOptimizer\nThe optimizer for training.\nrequired\n\n\nnum_training_steps\nint\nTotal number of training steps\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLRScheduler | None\nThe created learning rate scheduler.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.create_optimizer(cfg, trainer)\nCreates and returns an optimizer for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nOptimizer | None\nThe created optimizer.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_collator_cls_and_kwargs(cfg, is_eval=False)\nReturns a custom class for the 
collator.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe global axolotl configuration.\nrequired\n\n\nis_eval\nbool\nWhether this is an eval split.\nFalse\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nclass\n\nThe class for the collator.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_input_args()\nReturns a pydantic model for the plugin’s input arguments.\n\n\n\nintegrations.base.BasePlugin.get_trainer_cls(cfg)\nReturns a custom class for the trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe global axolotl configuration.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntype[Trainer] | None\nThe first non-None trainer class returned by a plugin.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_training_args(cfg)\nReturns custom training arguments to set on TrainingArgs.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe global axolotl configuration.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nobject\n\ndict containing the training arguments.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_training_args_mixin()\nReturns a dataclass model for the plugin’s training arguments.\n\n\n\nintegrations.base.BasePlugin.load_datasets(cfg, preprocess=False)\nLoads and preprocesses the dataset for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\npreprocess\nbool\nWhether this is the preprocess step of the datasets.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndataset_meta\nUnion['TrainDatasetMeta', None]\nThe metadata for the training dataset.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.on_rollouts_scored(\n    cfg,\n    trainer,\n    prompts,\n    completions,\n    rewards,\n    advantages,\n)\nCalled after rollouts are scored during online RL (GRPO/PPO).\nProvides access to the full scored 
rollout data for logging, trace\nstorage, or analysis. Called once per scoring step with all samples\nfrom that step.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe axolotl configuration.\nrequired\n\n\ntrainer\n\nThe trainer instance.\nrequired\n\n\nprompts\nlist[str]\nList of prompt texts (one per sample).\nrequired\n\n\ncompletions\nlist[str]\nList of completion texts (one per sample).\nrequired\n\n\nrewards\ndict[str, list[float]]\nDict mapping reward function name to list of reward values.\nrequired\n\n\nadvantages\nlist[float]\nList of advantage values (one per sample).\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_lora_load(cfg, model)\nPerforms actions after LoRA weights are loaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_model_build(cfg, model)\nPerforms actions after the model is built/loaded, but before any adapters are applied.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_model_load(cfg, model)\nPerforms actions after the model is loaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_train(cfg, model)\nPerforms actions after training is complete.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe axolotl configuration.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_train_unload(cfg)\nPerforms actions after training is complete and the model is 
unloaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_trainer_create(cfg, trainer)\nPerforms actions after the trainer is created.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.pre_lora_load(cfg, model)\nPerforms actions before LoRA weights are loaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.pre_model_load(cfg)\nPerforms actions before the model is loaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.register(cfg)\nRegisters the plugin with the given configuration as an unparsed dict.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\n\n\nintegrations.base.PluginManager()\nThe PluginManager class is responsible for loading and managing plugins. 
It\nshould be a singleton so it can be accessed from anywhere in the codebase.\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nplugins\nOrderedDict[str, BasePlugin]\nA list of loaded plugins.\n\n\n\n\n\n\nKey methods include:\n- get_instance(): Static method to get the singleton instance of PluginManager.\n- register(plugin_name: str): Registers a new plugin by its name.\n- pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nCalls the add_callbacks_post_trainer method of all registered plugins.\n\n\nadd_callbacks_pre_trainer\nCalls the add_callbacks_pre_trainer method of all registered plugins.\n\n\ncreate_lr_scheduler\nCalls the create_lr_scheduler method of all registered plugins and returns\n\n\ncreate_optimizer\nCalls the create_optimizer method of all registered plugins and returns\n\n\nget_collator_cls_and_kwargs\nCalls the get_collator_cls_and_kwargs method of all registered plugins and returns the first non-None collator class.\n\n\nget_input_args\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\n\n\nget_instance\nReturns the singleton instance of PluginManager. 
If the instance doesn’t\n\n\nget_trainer_cls\nCalls the get_trainer_cls method of all registered plugins and returns the\n\n\nget_training_args\nCalls the get_training_args method of all registered plugins and returns the combined training arguments.\n\n\nget_training_args_mixin\nReturns a list of dataclasses for all registered plugins’ training args mixins’\n\n\nload_datasets\nCalls the load_datasets method of each registered plugin.\n\n\non_rollouts_scored\nCalls the on_rollouts_scored method of all registered plugins.\n\n\npost_lora_load\nCalls the post_lora_load method of all registered plugins.\n\n\npost_model_build\nCalls the post_model_build method of all registered plugins after the\n\n\npost_model_load\nCalls the post_model_load method of all registered plugins after the model\n\n\npost_train\nCalls the post_train method of all registered plugins.\n\n\npost_train_unload\nCalls the post_train_unload method of all registered plugins.\n\n\npost_trainer_create\nCalls the post_trainer_create method of all registered plugins.\n\n\npre_lora_load\nCalls the pre_lora_load method of all registered plugins.\n\n\npre_model_load\nCalls the pre_model_load method of all registered plugins.\n\n\nregister\nRegisters a new plugin by its name.\n\n\n\n\n\nintegrations.base.PluginManager.add_callbacks_post_trainer(cfg, trainer)\nCalls the add_callbacks_post_trainer method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[Callable]\nA list of callback functions to be added to the TrainingArgs.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.add_callbacks_pre_trainer(cfg, model)\nCalls the add_callbacks_pre_trainer method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe 
configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[Callable]\nA list of callback functions to be added to the TrainingArgs.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.create_lr_scheduler(\n    trainer,\n    optimizer,\n    num_training_steps,\n)\nCalls the create_lr_scheduler method of all registered plugins and returns\nthe first non-None scheduler.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\noptimizer\nOptimizer\nThe optimizer for training.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLRScheduler | None\nThe created learning rate scheduler, or None if not found.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.create_optimizer(trainer)\nCalls the create_optimizer method of all registered plugins and returns\nthe first non-None optimizer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nOptimizer | None\nThe created optimizer, or None if none was found.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.get_collator_cls_and_kwargs(cfg, is_eval=False)\nCalls the get_collator_cls_and_kwargs method of all registered plugins and returns the first non-None collator class.\nParameters:\ncfg (dict): The configuration for the plugins.\nis_eval (bool): Whether this is an eval split.\nReturns:\nobject: The collator class, or None if none was found.\n\n\n\nintegrations.base.PluginManager.get_input_args()\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[str]\nA list of Pydantic classes for all registered plugins’ input arguments.’\n\n\n\n\n\n\n\nintegrations.base.PluginManager.get_instance()\nReturns the singleton instance of 
PluginManager. If the instance doesn’t\nexist, it creates a new one.\n\n\n\nintegrations.base.PluginManager.get_trainer_cls(cfg)\nCalls the get_trainer_cls method of all registered plugins and returns the\nfirst non-None trainer class.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTrainer | None\nThe first non-None trainer class returned by a plugin.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.get_training_args(cfg)\nCalls the get_training_args method of all registered plugins and returns the combined training arguments.\nParameters:\ncfg (dict): The configuration for the plugins.\nReturns:\nobject: The training arguments\n\n\n\nintegrations.base.PluginManager.get_training_args_mixin()\nReturns a list of dataclasses for all registered plugins’ training args mixins’\nReturns:\nlist[str]: A list of dataclsses\n\n\n\nintegrations.base.PluginManager.load_datasets(cfg, preprocess=False)\nCalls the load_datasets method of each registered plugin.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\npreprocess\nbool\nWhether this is preprocess step of the datasets.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nUnion['TrainDatasetMeta', None]\nThe dataset metadata loaded from all registered plugins.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.on_rollouts_scored(\n    cfg,\n    trainer,\n    prompts,\n    completions,\n    rewards,\n    advantages,\n)\nCalls the on_rollouts_scored method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\ntrainer\n\nThe trainer instance.\nrequired\n\n\nprompts\nlist[str]\nList of prompt texts.\nrequired\n\n\ncompletions\nlist[str]\nList of completion 
texts.\nrequired\n\n\nrewards\ndict[str, list[float]]\nDict mapping reward function name to list of rewards.\nrequired\n\n\nadvantages\nlist[float]\nList of advantage values.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_lora_load(cfg, model)\nCalls the post_lora_load method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_model_build(cfg, model)\nCalls the post_model_build method of all registered plugins after the\nmodel has been built / loaded, but before any adapters have been applied.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_model_load(cfg, model)\nCalls the post_model_load method of all registered plugins after the model\nhas been loaded inclusive of any adapters.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_train(cfg, model)\nCalls the post_train method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_train_unload(cfg)\nCalls the post_train_unload method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_trainer_create(cfg, trainer)\nCalls the 
post_trainer_create method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.pre_lora_load(cfg, model)\nCalls the pre_lora_load method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.pre_model_load(cfg)\nCalls the pre_model_load method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.register(plugin_name)\nRegisters a new plugin by its name.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nplugin_name\nstr\nThe name of the plugin to be registered.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nImportError\nIf the plugin module cannot be imported.\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_plugin\nLoads a plugin based on the given plugin name.\n\n\n\n\n\nintegrations.base.load_plugin(plugin_name)\nLoads a plugin based on the given plugin name.\nThe plugin name should be in the format “module_name.class_name”. This function\nsplits the plugin name into module and class, imports the module, retrieves the\nclass from the module, and creates an instance of the class.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nplugin_name\nstr\nThe name of the plugin to be loaded. 
The name should be in the format “module_name.class_name”.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nBasePlugin\nAn instance of the loaded plugin.\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nImportError\nIf the plugin module cannot be imported."
   },
   {
-    "objectID": "docs/api/utils.collators.mm_chat.html#classes",
-    "href": "docs/api/utils.collators.mm_chat.html#classes",
-    "title": "utils.collators.mm_chat",
+    "objectID": "docs/api/integrations.base.html#classes",
+    "href": "docs/api/integrations.base.html#classes",
+    "title": "integrations.base",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nMultiModalChatDataCollator\nCollator for multi-modal chat messages\n\n\n\n\n\nutils.collators.mm_chat.MultiModalChatDataCollator(\n    tokenizer,\n    processing_strategy,\n    packing=False,\n    return_tensors='pt',\n    padding=True,\n    pad_to_multiple_of=None,\n)\nCollator for multi-modal chat messages"
+    "text": "Name\nDescription\n\n\n\n\nBaseOptimizerFactory\nBase class for factories to create custom optimizers\n\n\nBasePlugin\nBase class for all plugins. Defines the interface for plugin methods.\n\n\nPluginManager\nThe PluginManager class is responsible for loading and managing plugins. It\n\n\n\n\n\nintegrations.base.BaseOptimizerFactory()\nBase class for factories to create custom optimizers\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_decay_parameter_names\nGet all parameter names that weight decay will be applied to.\n\n\n\n\n\nintegrations.base.BaseOptimizerFactory.get_decay_parameter_names(model)\nGet all parameter names that weight decay will be applied to.\nThis function filters out parameters in two ways:\n1. By layer type (instances of layers specified in ALL_LAYERNORM_LAYERS)\n2. By parameter name patterns (containing ‘bias’, or variation of ‘norm’)\n\n\n\n\n\nintegrations.base.BasePlugin()\nBase class for all plugins. Defines the interface for plugin methods.\nA plugin is a reusable, modular, and self-contained piece of code that extends\nthe functionality of Axolotl. 
Plugins can be used to integrate third-party models,\nmodify the training process, or add new features.\nTo create a new plugin, you need to inherit from the BasePlugin class and\nimplement the required methods.\n\n\nPlugin methods include:\n- register(cfg): Registers the plugin with the given configuration.\n- load_datasets(cfg): Loads and preprocesses the dataset for training.\n- pre_model_load(cfg): Performs actions before the model is loaded.\n- post_model_build(cfg, model): Performs actions after the model is loaded, but\nbefore LoRA adapters are applied.\n- pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.\n- post_lora_load(cfg, model): Performs actions after LoRA weights are loaded.\n- post_model_load(cfg, model): Performs actions after the model is loaded,\ninclusive of any adapters.\n- post_trainer_create(cfg, trainer): Performs actions after the trainer is\ncreated.\n- create_optimizer(cfg, trainer): Creates and returns an optimizer for training.\n- create_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and\nreturns a learning rate scheduler.\n- add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before\ntraining.\n- add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after\ntraining.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nAdds callbacks to the trainer after creating the trainer. 
This is useful for\n\n\nadd_callbacks_pre_trainer\nSet up callbacks before creating the trainer.\n\n\ncreate_lr_scheduler\nCreates and returns a learning rate scheduler.\n\n\ncreate_optimizer\nCreates and returns an optimizer for training.\n\n\nget_collator_cls_and_kwargs\nReturns a custom class for the collator.\n\n\nget_input_args\nReturns a pydantic model for the plugin’s input arguments.\n\n\nget_trainer_cls\nReturns a custom class for the trainer.\n\n\nget_training_args\nReturns custom training arguments to set on TrainingArgs.\n\n\nget_training_args_mixin\nReturns a dataclass model for the plugin’s training arguments.\n\n\nload_datasets\nLoads and preprocesses the dataset for training.\n\n\non_rollouts_scored\nCalled after rollouts are scored during online RL (GRPO/PPO).\n\n\npost_lora_load\nPerforms actions after LoRA weights are loaded.\n\n\npost_model_build\nPerforms actions after the model is built/loaded, but before any adapters are applied.\n\n\npost_model_load\nPerforms actions after the model is loaded.\n\n\npost_train\nPerforms actions after training is complete.\n\n\npost_train_unload\nPerforms actions after training is complete and the model is unloaded.\n\n\npost_trainer_create\nPerforms actions after the trainer is created.\n\n\npre_lora_load\nPerforms actions before LoRA weights are loaded.\n\n\npre_model_load\nPerforms actions before the model is loaded.\n\n\nregister\nRegisters the plugin with the given configuration as an unparsed dict.\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_post_trainer(cfg, trainer)\nAdds callbacks to the trainer after creating the trainer. 
This is useful for\ncallbacks that require access to the model or trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[Callable]\nA list of callback functions to be added\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_pre_trainer(cfg, model)\nSet up callbacks before creating the trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[Callable]\nA list of callback functions to be added to the TrainingArgs.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.create_lr_scheduler(\n    cfg,\n    trainer,\n    optimizer,\n    num_training_steps,\n)\nCreates and returns a learning rate scheduler.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\noptimizer\nOptimizer\nThe optimizer for training.\nrequired\n\n\nnum_training_steps\nint\nTotal number of training steps\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLRScheduler | None\nThe created learning rate scheduler.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.create_optimizer(cfg, trainer)\nCreates and returns an optimizer for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nOptimizer | None\nThe created optimizer.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_collator_cls_and_kwargs(cfg, is_eval=False)\nReturns a custom class for the 
collator.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe global axolotl configuration.\nrequired\n\n\nis_eval\nbool\nWhether this is an eval split.\nFalse\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nclass\n\nThe class for the collator.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_input_args()\nReturns a pydantic model for the plugin’s input arguments.\n\n\n\nintegrations.base.BasePlugin.get_trainer_cls(cfg)\nReturns a custom class for the trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe global axolotl configuration.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntype[Trainer] | None\nThe first non-None trainer class returned by a plugin.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_training_args(cfg)\nReturns custom training arguments to set on TrainingArgs.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe global axolotl configuration.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nobject\n\ndict containing the training arguments.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_training_args_mixin()\nReturns a dataclass model for the plugin’s training arguments.\n\n\n\nintegrations.base.BasePlugin.load_datasets(cfg, preprocess=False)\nLoads and preprocesses the dataset for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\npreprocess\nbool\nWhether this is the preprocess step of the datasets.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndataset_meta\nUnion['TrainDatasetMeta', None]\nThe metadata for the training dataset.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.on_rollouts_scored(\n    cfg,\n    trainer,\n    prompts,\n    completions,\n    rewards,\n    advantages,\n)\nCalled after rollouts are scored during online RL (GRPO/PPO).\nProvides access to the full scored 
rollout data for logging, trace\nstorage, or analysis. Called once per scoring step with all samples\nfrom that step.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe axolotl configuration.\nrequired\n\n\ntrainer\n\nThe trainer instance.\nrequired\n\n\nprompts\nlist[str]\nList of prompt texts (one per sample).\nrequired\n\n\ncompletions\nlist[str]\nList of completion texts (one per sample).\nrequired\n\n\nrewards\ndict[str, list[float]]\nDict mapping reward function name to list of reward values.\nrequired\n\n\nadvantages\nlist[float]\nList of advantage values (one per sample).\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_lora_load(cfg, model)\nPerforms actions after LoRA weights are loaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_model_build(cfg, model)\nPerforms actions after the model is built/loaded, but before any adapters are applied.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_model_load(cfg, model)\nPerforms actions after the model is loaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_train(cfg, model)\nPerforms actions after training is complete.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe axolotl configuration.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_train_unload(cfg)\nPerforms actions after training is complete and the model is 
unloaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_trainer_create(cfg, trainer)\nPerforms actions after the trainer is created.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.pre_lora_load(cfg, model)\nPerforms actions before LoRA weights are loaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.pre_model_load(cfg)\nPerforms actions before the model is loaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.register(cfg)\nRegisters the plugin with the given configuration as an unparsed dict.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\n\n\nintegrations.base.PluginManager()\nThe PluginManager class is responsible for loading and managing plugins. 
It\nshould be a singleton so it can be accessed from anywhere in the codebase.\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nplugins\nOrderedDict[str, BasePlugin]\nA list of loaded plugins.\n\n\n\n\n\n\nKey methods include:\n- get_instance(): Static method to get the singleton instance of PluginManager.\n- register(plugin_name: str): Registers a new plugin by its name.\n- pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nCalls the add_callbacks_post_trainer method of all registered plugins.\n\n\nadd_callbacks_pre_trainer\nCalls the add_callbacks_pre_trainer method of all registered plugins.\n\n\ncreate_lr_scheduler\nCalls the create_lr_scheduler method of all registered plugins and returns\n\n\ncreate_optimizer\nCalls the create_optimizer method of all registered plugins and returns\n\n\nget_collator_cls_and_kwargs\nCalls the get_collator_cls_and_kwargs method of all registered plugins and returns the first non-None collator class.\n\n\nget_input_args\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\n\n\nget_instance\nReturns the singleton instance of PluginManager. 
If the instance doesn’t\n\n\nget_trainer_cls\nCalls the get_trainer_cls method of all registered plugins and returns the\n\n\nget_training_args\nCalls the get_training_args method of all registered plugins and returns the combined training arguments.\n\n\nget_training_args_mixin\nReturns a list of dataclasses for all registered plugins’ training args mixins’\n\n\nload_datasets\nCalls the load_datasets method of each registered plugin.\n\n\non_rollouts_scored\nCalls the on_rollouts_scored method of all registered plugins.\n\n\npost_lora_load\nCalls the post_lora_load method of all registered plugins.\n\n\npost_model_build\nCalls the post_model_build method of all registered plugins after the\n\n\npost_model_load\nCalls the post_model_load method of all registered plugins after the model\n\n\npost_train\nCalls the post_train method of all registered plugins.\n\n\npost_train_unload\nCalls the post_train_unload method of all registered plugins.\n\n\npost_trainer_create\nCalls the post_trainer_create method of all registered plugins.\n\n\npre_lora_load\nCalls the pre_lora_load method of all registered plugins.\n\n\npre_model_load\nCalls the pre_model_load method of all registered plugins.\n\n\nregister\nRegisters a new plugin by its name.\n\n\n\n\n\nintegrations.base.PluginManager.add_callbacks_post_trainer(cfg, trainer)\nCalls the add_callbacks_post_trainer method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[Callable]\nA list of callback functions to be added to the TrainingArgs.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.add_callbacks_pre_trainer(cfg, model)\nCalls the add_callbacks_pre_trainer method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe 
configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[Callable]\nA list of callback functions to be added to the TrainingArgs.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.create_lr_scheduler(\n    trainer,\n    optimizer,\n    num_training_steps,\n)\nCalls the create_lr_scheduler method of all registered plugins and returns\nthe first non-None scheduler.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\noptimizer\nOptimizer\nThe optimizer for training.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLRScheduler | None\nThe created learning rate scheduler, or None if not found.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.create_optimizer(trainer)\nCalls the create_optimizer method of all registered plugins and returns\nthe first non-None optimizer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nOptimizer | None\nThe created optimizer, or None if none was found.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.get_collator_cls_and_kwargs(cfg, is_eval=False)\nCalls the get_collator_cls_and_kwargs method of all registered plugins and returns the first non-None collator class.\nParameters:\ncfg (dict): The configuration for the plugins.\nis_eval (bool): Whether this is an eval split.\nReturns:\nobject: The collator class, or None if none was found.\n\n\n\nintegrations.base.PluginManager.get_input_args()\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[str]\nA list of Pydantic classes for all registered plugins’ input arguments.’\n\n\n\n\n\n\n\nintegrations.base.PluginManager.get_instance()\nReturns the singleton instance of 
PluginManager. If the instance doesn’t\nexist, it creates a new one.\n\n\n\nintegrations.base.PluginManager.get_trainer_cls(cfg)\nCalls the get_trainer_cls method of all registered plugins and returns the\nfirst non-None trainer class.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTrainer | None\nThe first non-None trainer class returned by a plugin.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.get_training_args(cfg)\nCalls the get_training_args method of all registered plugins and returns the combined training arguments.\nParameters:\ncfg (dict): The configuration for the plugins.\nReturns:\nobject: The training arguments\n\n\n\nintegrations.base.PluginManager.get_training_args_mixin()\nReturns a list of dataclasses for all registered plugins’ training args mixins’\nReturns:\nlist[str]: A list of dataclsses\n\n\n\nintegrations.base.PluginManager.load_datasets(cfg, preprocess=False)\nCalls the load_datasets method of each registered plugin.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\npreprocess\nbool\nWhether this is preprocess step of the datasets.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nUnion['TrainDatasetMeta', None]\nThe dataset metadata loaded from all registered plugins.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.on_rollouts_scored(\n    cfg,\n    trainer,\n    prompts,\n    completions,\n    rewards,\n    advantages,\n)\nCalls the on_rollouts_scored method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\ntrainer\n\nThe trainer instance.\nrequired\n\n\nprompts\nlist[str]\nList of prompt texts.\nrequired\n\n\ncompletions\nlist[str]\nList of completion 
texts.\nrequired\n\n\nrewards\ndict[str, list[float]]\nDict mapping reward function name to list of rewards.\nrequired\n\n\nadvantages\nlist[float]\nList of advantage values.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_lora_load(cfg, model)\nCalls the post_lora_load method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_model_build(cfg, model)\nCalls the post_model_build method of all registered plugins after the\nmodel has been built / loaded, but before any adapters have been applied.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_model_load(cfg, model)\nCalls the post_model_load method of all registered plugins after the model\nhas been loaded inclusive of any adapters.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_train(cfg, model)\nCalls the post_train method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_train_unload(cfg)\nCalls the post_train_unload method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_trainer_create(cfg, trainer)\nCalls the 
post_trainer_create method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.pre_lora_load(cfg, model)\nCalls the pre_lora_load method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.pre_model_load(cfg)\nCalls the pre_model_load method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.register(plugin_name)\nRegisters a new plugin by its name.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nplugin_name\nstr\nThe name of the plugin to be registered.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nImportError\nIf the plugin module cannot be imported."
   },
   {
-    "objectID": "docs/api/cli.config.html",
-    "href": "docs/api/cli.config.html",
-    "title": "cli.config",
+    "objectID": "docs/api/integrations.base.html#functions",
+    "href": "docs/api/integrations.base.html#functions",
+    "title": "integrations.base",
     "section": "",
-    "text": "cli.config\nConfiguration loading and processing.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncheck_remote_config\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\n\n\nchoose_config\nHelper method for choosing a axolotl config YAML file (considering only files\n\n\nload_cfg\nLoads the axolotl configuration stored at config, validates it, and performs\n\n\nprepare_plugins\nRegisters the plugins for the given configuration.\n\n\n\n\n\ncli.config.check_remote_config(config)\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\nfor it and parse its content, first as JSON, then as YAML (YAML is preferred).\nFinally, the parsed content is written to a local file and its path is returned.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[str, Path]\nHTTPS URL to a YAML or JSON file.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nUnion[str, Path]\nEither the original config if it’s not a valid HTTPS URL, or the path to the\n\n\n\nUnion[str, Path]\ndownloaded remote config.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the remote configuration is neither valid JSON or YAML.\n\n\n\nRuntimeError\nIf some request-related exception occurs from the file download.\n\n\n\nException\nCatch-all for any other exception.\n\n\n\n\n\n\n\ncli.config.choose_config(path)\nHelper method for choosing a axolotl config YAML file (considering only files\nending with .yml or .yaml). 
If more than one config file exists in the passed\npath, the user is prompted to choose one.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nPath\nDirectory in which config file(s) are stored.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to either (1) the sole YAML file, or (2) if more than one YAML files exist,\n\n\n\nstr\nthe user-selected YAML file.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf no YAML files are found in the given path.\n\n\n\n\n\n\n\ncli.config.load_cfg(config=Path('examples/'), **kwargs)\nLoads the axolotl configuration stored at config, validates it, and performs\nvarious setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr | Path | DictDefault\nPath (local or remote) to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nDictDefault\nDictDefault mapping configuration keys to values.\n\n\n\n\n\n\n\ncli.config.prepare_plugins(cfg)\nRegisters the plugins for the given configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired"
+    "text": "Name\nDescription\n\n\n\n\nload_plugin\nLoads a plugin based on the given plugin name.\n\n\n\n\n\nintegrations.base.load_plugin(plugin_name)\nLoads a plugin based on the given plugin name.\nThe plugin name should be in the format “module_name.class_name”. This function\nsplits the plugin name into module and class, imports the module, retrieves the\nclass from the module, and creates an instance of the class.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nplugin_name\nstr\nThe name of the plugin to be loaded. The name should be in the format “module_name.class_name”.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nBasePlugin\nAn instance of the loaded plugin.\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nImportError\nIf the plugin module cannot be imported."
   },
   {
-    "objectID": "docs/api/cli.config.html#functions",
-    "href": "docs/api/cli.config.html#functions",
-    "title": "cli.config",
+    "objectID": "docs/api/prompt_strategies.kto.user_defined.html",
+    "href": "docs/api/prompt_strategies.kto.user_defined.html",
+    "title": "prompt_strategies.kto.user_defined",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ncheck_remote_config\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\n\n\nchoose_config\nHelper method for choosing a axolotl config YAML file (considering only files\n\n\nload_cfg\nLoads the axolotl configuration stored at config, validates it, and performs\n\n\nprepare_plugins\nRegisters the plugins for the given configuration.\n\n\n\n\n\ncli.config.check_remote_config(config)\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\nfor it and parse its content, first as JSON, then as YAML (YAML is preferred).\nFinally, the parsed content is written to a local file and its path is returned.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[str, Path]\nHTTPS URL to a YAML or JSON file.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nUnion[str, Path]\nEither the original config if it’s not a valid HTTPS URL, or the path to the\n\n\n\nUnion[str, Path]\ndownloaded remote config.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the remote configuration is neither valid JSON or YAML.\n\n\n\nRuntimeError\nIf some request-related exception occurs from the file download.\n\n\n\nException\nCatch-all for any other exception.\n\n\n\n\n\n\n\ncli.config.choose_config(path)\nHelper method for choosing a axolotl config YAML file (considering only files\nending with .yml or .yaml). 
If more than one config file exists in the passed\npath, the user is prompted to choose one.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nPath\nDirectory in which config file(s) are stored.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to either (1) the sole YAML file, or (2) if more than one YAML files exist,\n\n\n\nstr\nthe user-selected YAML file.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf no YAML files are found in the given path.\n\n\n\n\n\n\n\ncli.config.load_cfg(config=Path('examples/'), **kwargs)\nLoads the axolotl configuration stored at config, validates it, and performs\nvarious setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr | Path | DictDefault\nPath (local or remote) to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nDictDefault\nDictDefault mapping configuration keys to values.\n\n\n\n\n\n\n\ncli.config.prepare_plugins(cfg)\nRegisters the plugins for the given configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired"
+    "text": "prompt_strategies.kto.user_defined\nprompt_strategies.kto.user_defined\nUser-defined KTO strategies"
   },
   {
-    "objectID": "docs/api/monkeypatch.multipack.html",
-    "href": "docs/api/monkeypatch.multipack.html",
-    "title": "monkeypatch.multipack",
+    "objectID": "docs/api/prompt_strategies.alpaca_w_system.html",
+    "href": "docs/api/prompt_strategies.alpaca_w_system.html",
+    "title": "prompt_strategies.alpaca_w_system",
     "section": "",
-    "text": "monkeypatch.multipack\nmonkeypatch.multipack\nmultipack patching for v2 of sample packing"
+    "text": "prompt_strategies.alpaca_w_system\nPrompt strategies loader for alpaca instruction datasets with system prompts\n\n\n\n\n\nName\nDescription\n\n\n\n\nInstructionWSystemPromptTokenizingStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nOpenOrcaPromptTokenizingStrategy\nTokenizing strategy for OpenOrca datasets\n\n\nOpenOrcaSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\nSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset\n\n\n\n\n\nprompt_strategies.alpaca_w_system.InstructionWSystemPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for OpenOrca datasets\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaSystemDataPrompter(\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\n\nprompt_strategies.alpaca_w_system.SystemDataPrompter(\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset"
   },
   {
-    "objectID": "docs/api/integrations.grokfast.optimizer.html",
-    "href": "docs/api/integrations.grokfast.optimizer.html",
-    "title": "integrations.grokfast.optimizer",
+    "objectID": "docs/api/prompt_strategies.alpaca_w_system.html#classes",
+    "href": "docs/api/prompt_strategies.alpaca_w_system.html#classes",
+    "title": "prompt_strategies.alpaca_w_system",
     "section": "",
-    "text": "integrations.grokfast.optimizer\nintegrations.grokfast.optimizer"
+    "text": "Name\nDescription\n\n\n\n\nInstructionWSystemPromptTokenizingStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nOpenOrcaPromptTokenizingStrategy\nTokenizing strategy for OpenOrca datasets\n\n\nOpenOrcaSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\nSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset\n\n\n\n\n\nprompt_strategies.alpaca_w_system.InstructionWSystemPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for OpenOrca datasets\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaSystemDataPrompter(\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\n\nprompt_strategies.alpaca_w_system.SystemDataPrompter(\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset"
   },
   {
-    "objectID": "docs/api/core.trainers.trl.html",
-    "href": "docs/api/core.trainers.trl.html",
-    "title": "core.trainers.trl",
+    "objectID": "docs/api/models.mamba.modeling_mamba.html",
+    "href": "docs/api/models.mamba.modeling_mamba.html",
+    "title": "models.mamba.modeling_mamba",
     "section": "",
-    "text": "core.trainers.trl\nModule for TRL RL trainers\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlCPOTrainer\nExtend the base CPOTrainer for axolotl helpers\n\n\nAxolotlKTOTrainer\nExtend the base KTOTrainer for axolotl helpers\n\n\nAxolotlORPOTrainer\nExtend the base ORPOTrainer for axolotl helpers\n\n\nAxolotlPRMTrainer\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\nAxolotlRewardTrainer\nExtend the base RewardTrainer for axolotl helpers\n\n\n\n\n\ncore.trainers.trl.AxolotlCPOTrainer(*args, **kwargs)\nExtend the base CPOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlKTOTrainer(*args, **kwargs)\nExtend the base KTOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlORPOTrainer(*args, **kwargs)\nExtend the base ORPOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlPRMTrainer(*args, **kwargs)\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlRewardTrainer(*args, **kwargs)\nExtend the base RewardTrainer for axolotl helpers"
+    "text": "models.mamba.modeling_mamba\nmodels.mamba.modeling_mamba"
   },
   {
-    "objectID": "docs/api/core.trainers.trl.html#classes",
-    "href": "docs/api/core.trainers.trl.html#classes",
-    "title": "core.trainers.trl",
+    "objectID": "docs/api/cli.train.html",
+    "href": "docs/api/cli.train.html",
+    "title": "cli.train",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nAxolotlCPOTrainer\nExtend the base CPOTrainer for axolotl helpers\n\n\nAxolotlKTOTrainer\nExtend the base KTOTrainer for axolotl helpers\n\n\nAxolotlORPOTrainer\nExtend the base ORPOTrainer for axolotl helpers\n\n\nAxolotlPRMTrainer\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\nAxolotlRewardTrainer\nExtend the base RewardTrainer for axolotl helpers\n\n\n\n\n\ncore.trainers.trl.AxolotlCPOTrainer(*args, **kwargs)\nExtend the base CPOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlKTOTrainer(*args, **kwargs)\nExtend the base KTOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlORPOTrainer(*args, **kwargs)\nExtend the base ORPOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlPRMTrainer(*args, **kwargs)\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlRewardTrainer(*args, **kwargs)\nExtend the base RewardTrainer for axolotl helpers"
+    "text": "cli.train\nCLI to run training on a model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_train.\n\n\ndo_train\nTrains a transformers model by first loading the dataset(s) specified in the\n\n\n\n\n\ncli.train.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_train.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.train.do_train(cfg, cli_args)\nTrains a transformers model by first loading the dataset(s) specified in the\naxolotl config, and then calling axolotl.train.train. Also runs the plugin\nmanager’s post_train_unload once training completes.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nTrainerCliArgs\nTraining-specific CLI arguments.\nrequired"
   },
   {
-    "objectID": "docs/api/prompt_strategies.alpaca_chat.html",
-    "href": "docs/api/prompt_strategies.alpaca_chat.html",
-    "title": "prompt_strategies.alpaca_chat",
+    "objectID": "docs/api/cli.train.html#functions",
+    "href": "docs/api/cli.train.html#functions",
+    "title": "cli.train",
     "section": "",
-    "text": "prompt_strategies.alpaca_chat\nModule for Alpaca prompt strategy classes\n\n\n\n\n\nName\nDescription\n\n\n\n\nAlpacaChatPrompter\nAlpaca Chat Prompter extending the system prompt to for chat-instruct answers\n\n\nAlpacaConcisePrompter\nAlpaca Prompter extending the system prompt to ask for concise chat-instruct answers\n\n\nAlpacaQAPromptTokenizingStrategy\nTokenizing strategy for AlpacaQA\n\n\nCamelAIPromptTokenizingStrategy\nTokenizing strategy for CamelAI datasets\n\n\nNoSystemPrompter\nNull Prompter with no system prompts\n\n\n\n\n\nprompt_strategies.alpaca_chat.AlpacaChatPrompter()\nAlpaca Chat Prompter extending the system prompt to for chat-instruct answers\n\n\n\nprompt_strategies.alpaca_chat.AlpacaConcisePrompter(\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Prompter extending the system prompt to ask for concise chat-instruct answers\n\n\n\nprompt_strategies.alpaca_chat.AlpacaQAPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for AlpacaQA\n\n\n\nprompt_strategies.alpaca_chat.CamelAIPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for CamelAI datasets\n\n\n\nprompt_strategies.alpaca_chat.NoSystemPrompter()\nNull Prompter with no system prompts"
+    "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_train.\n\n\ndo_train\nTrains a transformers model by first loading the dataset(s) specified in the\n\n\n\n\n\ncli.train.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_train.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.train.do_train(cfg, cli_args)\nTrains a transformers model by first loading the dataset(s) specified in the\naxolotl config, and then calling axolotl.train.train. Also runs the plugin\nmanager’s post_train_unload once training completes.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nTrainerCliArgs\nTraining-specific CLI arguments.\nrequired"
   },
   {
-    "objectID": "docs/api/prompt_strategies.alpaca_chat.html#classes",
-    "href": "docs/api/prompt_strategies.alpaca_chat.html#classes",
-    "title": "prompt_strategies.alpaca_chat",
+    "objectID": "docs/api/utils.freeze.html",
+    "href": "docs/api/utils.freeze.html",
+    "title": "utils.freeze",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nAlpacaChatPrompter\nAlpaca Chat Prompter extending the system prompt to for chat-instruct answers\n\n\nAlpacaConcisePrompter\nAlpaca Prompter extending the system prompt to ask for concise chat-instruct answers\n\n\nAlpacaQAPromptTokenizingStrategy\nTokenizing strategy for AlpacaQA\n\n\nCamelAIPromptTokenizingStrategy\nTokenizing strategy for CamelAI datasets\n\n\nNoSystemPrompter\nNull Prompter with no system prompts\n\n\n\n\n\nprompt_strategies.alpaca_chat.AlpacaChatPrompter()\nAlpaca Chat Prompter extending the system prompt to for chat-instruct answers\n\n\n\nprompt_strategies.alpaca_chat.AlpacaConcisePrompter(\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Prompter extending the system prompt to ask for concise chat-instruct answers\n\n\n\nprompt_strategies.alpaca_chat.AlpacaQAPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for AlpacaQA\n\n\n\nprompt_strategies.alpaca_chat.CamelAIPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for CamelAI datasets\n\n\n\nprompt_strategies.alpaca_chat.NoSystemPrompter()\nNull Prompter with no system prompts"
+    "text": "utils.freeze\nmodule to freeze/unfreeze parameters by name\n\n\n\n\n\nName\nDescription\n\n\n\n\nLayerNamePattern\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nutils.freeze.LayerNamePattern(pattern)\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nChecks if the given layer name matches the regex pattern.\n\n\n\n\n\nutils.freeze.LayerNamePattern.match(name)\nChecks if the given layer name matches the regex pattern.\nParameters:\n- name (str): The layer name to check.\nReturns:\n- bool: True if the layer name matches the pattern, False otherwise.\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nfreeze_layers_except\nFreezes all layers of the given model except for the layers that match given regex patterns.\n\n\nfreeze_mm_modules\nFreeze all vision/audio/multimodal-projector parameters.\n\n\n\n\n\nutils.freeze.freeze_layers_except(model, regex_patterns)\nFreezes all layers of the given model except for the layers that match given regex patterns.\nPeriods in the patterns are treated as literal periods, not as wildcard characters.\nParameters:\n- model (nn.Module): The PyTorch model to be modified.\n- regex_patterns (list of str): List of regex patterns to match layer names to keep unfrozen.\nNote that you cannot use a dot as a wildcard character in the patterns since it is reserved for separating layer names.\nAlso, to match the entire layer name, the pattern should start with “^” and end with “\\(\", otherwise it will match any part of the layer name.\n  The range pattern part is optional and it is not compiled as a regex pattern which means you must put \"\\)” before the range pattern if you want to match the entire layer name.\nE.g., [“^model.embed_tokens.weight\\([:32000]\", \"layers.2[0-9]+.block_sparse_moe.gate.[a-z]+\\)”]\nReturns:\nNone; the model is modified in 
place.\n\n\n\nutils.freeze.freeze_mm_modules(model)\nFreeze all vision/audio/multimodal-projector parameters.\nIterates over model.named_parameters() and sets requires_grad = False\nfor any parameter whose name contains a known vision/audio module prefix.\nThis is useful when fine-tuning only the language backbone of a multimodal\nmodel and avoids the need for ddp_find_unused_parameters=True."
   },
   {
-    "objectID": "docs/api/core.builders.rl.html",
-    "href": "docs/api/core.builders.rl.html",
-    "title": "core.builders.rl",
+    "objectID": "docs/api/utils.freeze.html#classes",
+    "href": "docs/api/utils.freeze.html#classes",
+    "title": "utils.freeze",
     "section": "",
-    "text": "core.builders.rl\nBuilder for RLHF trainers\n\n\n\n\n\nName\nDescription\n\n\n\n\nHFRLTrainerBuilder\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)\n\n\n\n\n\ncore.builders.rl.HFRLTrainerBuilder(cfg, model, tokenizer, processor=None)\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)"
+    "text": "Name\nDescription\n\n\n\n\nLayerNamePattern\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nutils.freeze.LayerNamePattern(pattern)\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nChecks if the given layer name matches the regex pattern.\n\n\n\n\n\nutils.freeze.LayerNamePattern.match(name)\nChecks if the given layer name matches the regex pattern.\nParameters:\n- name (str): The layer name to check.\nReturns:\n- bool: True if the layer name matches the pattern, False otherwise."
   },
   {
-    "objectID": "docs/api/core.builders.rl.html#classes",
-    "href": "docs/api/core.builders.rl.html#classes",
-    "title": "core.builders.rl",
+    "objectID": "docs/api/utils.freeze.html#functions",
+    "href": "docs/api/utils.freeze.html#functions",
+    "title": "utils.freeze",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nHFRLTrainerBuilder\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)\n\n\n\n\n\ncore.builders.rl.HFRLTrainerBuilder(cfg, model, tokenizer, processor=None)\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)"
+    "text": "Name\nDescription\n\n\n\n\nfreeze_layers_except\nFreezes all layers of the given model except for the layers that match given regex patterns.\n\n\nfreeze_mm_modules\nFreeze all vision/audio/multimodal-projector parameters.\n\n\n\n\n\nutils.freeze.freeze_layers_except(model, regex_patterns)\nFreezes all layers of the given model except for the layers that match given regex patterns.\nPeriods in the patterns are treated as literal periods, not as wildcard characters.\nParameters:\n- model (nn.Module): The PyTorch model to be modified.\n- regex_patterns (list of str): List of regex patterns to match layer names to keep unfrozen.\nNote that you cannot use a dot as a wildcard character in the patterns since it is reserved for separating layer names.\nAlso, to match the entire layer name, the pattern should start with “^” and end with “\\(\", otherwise it will match any part of the layer name.\n  The range pattern part is optional and it is not compiled as a regex pattern which means you must put \"\\)” before the range pattern if you want to match the entire layer name.\nE.g., [“^model.embed_tokens.weight\\([:32000]\", \"layers.2[0-9]+.block_sparse_moe.gate.[a-z]+\\)”]\nReturns:\nNone; the model is modified in place.\n\n\n\nutils.freeze.freeze_mm_modules(model)\nFreeze all vision/audio/multimodal-projector parameters.\nIterates over model.named_parameters() and sets requires_grad = False\nfor any parameter whose name contains a known vision/audio module prefix.\nThis is useful when fine-tuning only the language backbone of a multimodal\nmodel and avoids the need for ddp_find_unused_parameters=True."
   },
   {
-    "objectID": "docs/api/utils.schemas.trl.html",
-    "href": "docs/api/utils.schemas.trl.html",
-    "title": "utils.schemas.trl",
+    "objectID": "docs/api/prompt_strategies.stepwise_supervised.html",
+    "href": "docs/api/prompt_strategies.stepwise_supervised.html",
+    "title": "prompt_strategies.stepwise_supervised",
     "section": "",
-    "text": "utils.schemas.trl\nPydantic models for TRL trainer configuration\n\n\n\n\n\nName\nDescription\n\n\n\n\nTRLConfig\nInput args for TRL.\n\n\n\n\n\nutils.schemas.trl.TRLConfig()\nInput args for TRL."
+    "text": "prompt_strategies.stepwise_supervised\nModule for stepwise datasets, typically including a prompt and reasoning traces,\nand (optionally) per-step, or per-prompt-trace labels for reward modelling.\n\n\n\n\n\nName\nDescription\n\n\n\n\nStepwiseSupervisedPromptTokenizingStrategy\nTokenizing strategy for supervised stepwise datasets, typically used for COT-reasoning.\n\n\n\n\n\nprompt_strategies.stepwise_supervised.StepwiseSupervisedPromptTokenizingStrategy(\n    tokenizer,\n    sequence_len=2048,\n    step_separator='\\n',\n    max_completion_length=None,\n    train_on_last_step_only=False,\n)\nTokenizing strategy for supervised stepwise datasets, typically used for COT-reasoning.\nThese datasets should include the following columns:\n- prompt: the prompt text\n- completions: a list of n completion steps\n- labels: a list of n labels indicating the “correctness” of each step"
   },
   {
-    "objectID": "docs/api/utils.schemas.trl.html#classes",
-    "href": "docs/api/utils.schemas.trl.html#classes",
-    "title": "utils.schemas.trl",
+    "objectID": "docs/api/prompt_strategies.stepwise_supervised.html#classes",
+    "href": "docs/api/prompt_strategies.stepwise_supervised.html#classes",
+    "title": "prompt_strategies.stepwise_supervised",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nTRLConfig\nInput args for TRL.\n\n\n\n\n\nutils.schemas.trl.TRLConfig()\nInput args for TRL."
+    "text": "Name\nDescription\n\n\n\n\nStepwiseSupervisedPromptTokenizingStrategy\nTokenizing strategy for supervised stepwise datasets, typically used for COT-reasoning.\n\n\n\n\n\nprompt_strategies.stepwise_supervised.StepwiseSupervisedPromptTokenizingStrategy(\n    tokenizer,\n    sequence_len=2048,\n    step_separator='\\n',\n    max_completion_length=None,\n    train_on_last_step_only=False,\n)\nTokenizing strategy for supervised stepwise datasets, typically used for COT-reasoning.\nThese datasets should include the following columns:\n- prompt: the prompt text\n- completions: a list of n completion steps\n- labels: a list of n labels indicating the “correctness” of each step"
   },
   {
-    "objectID": "docs/api/utils.collators.batching.html",
-    "href": "docs/api/utils.collators.batching.html",
-    "title": "utils.collators.batching",
+    "objectID": "docs/api/cli.delinearize_llama4.html",
+    "href": "docs/api/cli.delinearize_llama4.html",
+    "title": "cli.delinearize_llama4",
     "section": "",
-    "text": "utils.collators.batching\nData collators for axolotl to pad labels and position_ids for packed sequences\n\n\n\n\n\nName\nDescription\n\n\n\n\nBatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\nDataCollatorForSeq2Seq\nData collator that will dynamically pad the inputs received, as well as the labels and position_ids\n\n\nPretrainingBatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\nV2BatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\n\n\n\nutils.collators.batching.BatchSamplerDataCollatorForSeq2Seq(\n    tokenizer,\n    model=None,\n    padding=True,\n    max_length=None,\n    pad_to_multiple_of=None,\n    label_pad_token_id=-100,\n    position_pad_token_id=0,\n    return_tensors='pt',\n)\nCollator for multipack specific to the using the BatchSampler\n\n\n\nutils.collators.batching.DataCollatorForSeq2Seq(\n    tokenizer,\n    model=None,\n    padding=True,\n    max_length=None,\n    pad_to_multiple_of=None,\n    label_pad_token_id=-100,\n    position_pad_token_id=0,\n    return_tensors='pt',\n)\nData collator that will dynamically pad the inputs received, as well as the labels and position_ids\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntokenizer\n[PreTrainedTokenizer] or [PreTrainedTokenizerFast]\nThe tokenizer used for encoding the data.\nrequired\n\n\nmodel\n[PreTrainedModel]\nThe model that is being trained. 
If set and has the prepare_decoder_input_ids_from_labels, use it to prepare the decoder_input_ids This is useful when using label_smoothing to avoid calculating loss twice.\nNone\n\n\npadding\nbool, str or [~utils.PaddingStrategy], optional, defaults to True\nSelect a strategy to pad the returned sequences (according to the model’s padding side and padding index) among: - True or 'longest' (default): Pad to the longest sequence in the batch (or no padding if only a single sequence is provided). - 'max_length': Pad to a maximum length specified with the argument max_length or to the maximum acceptable input length for the model if that argument is not provided. - False or 'do_not_pad': No padding (i.e., can output a batch with sequences of different lengths).\nTrue\n\n\nmax_length\nint, optional\nMaximum length of the returned list and optionally padding length (see above).\nNone\n\n\npad_to_multiple_of\nint, optional\nIf set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability &gt;= 7.5 (Volta).\nNone\n\n\nlabel_pad_token_id\nint, optional, defaults to -100\nThe id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).\n-100\n\n\nreturn_tensors\nstr\nThe type of Tensor to return. Allowable values are “np”, “pt” and “tf”.\n'pt'\n\n\n\n\n\n\n\nutils.collators.batching.PretrainingBatchSamplerDataCollatorForSeq2Seq(\n    *args,\n    multipack_attn=True,\n    **kwargs,\n)\nCollator for multipack specific to the using the BatchSampler\n\n\n\nutils.collators.batching.V2BatchSamplerDataCollatorForSeq2Seq(\n    tokenizer,\n    model=None,\n    padding=True,\n    max_length=None,\n    pad_to_multiple_of=None,\n    label_pad_token_id=-100,\n    position_pad_token_id=0,\n    return_tensors='pt',\n    squash_position_ids=False,\n)\nCollator for multipack specific to the using the BatchSampler"
+    "text": "cli.delinearize_llama4\nCLI tool to delinearize quantized/Linearized Llama-4 models.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nConvert a patched HF format Llama4 model (with separated projections)\n\n\n\n\n\ncli.delinearize_llama4.do_cli(model, output)\nConvert a patched HF format Llama4 model (with separated projections)\nback to the original HF format (with fused projections).\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\nUnion[Path, str]\nPath to the patched HF model\nrequired\n\n\noutput\nUnion[Path, str]\nPath to save the converted model\nrequired"
   },
   {
-    "objectID": "docs/api/utils.collators.batching.html#classes",
-    "href": "docs/api/utils.collators.batching.html#classes",
-    "title": "utils.collators.batching",
+    "objectID": "docs/api/cli.delinearize_llama4.html#functions",
+    "href": "docs/api/cli.delinearize_llama4.html#functions",
+    "title": "cli.delinearize_llama4",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nBatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\nDataCollatorForSeq2Seq\nData collator that will dynamically pad the inputs received, as well as the labels and position_ids\n\n\nPretrainingBatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\nV2BatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\n\n\n\nutils.collators.batching.BatchSamplerDataCollatorForSeq2Seq(\n    tokenizer,\n    model=None,\n    padding=True,\n    max_length=None,\n    pad_to_multiple_of=None,\n    label_pad_token_id=-100,\n    position_pad_token_id=0,\n    return_tensors='pt',\n)\nCollator for multipack specific to the using the BatchSampler\n\n\n\nutils.collators.batching.DataCollatorForSeq2Seq(\n    tokenizer,\n    model=None,\n    padding=True,\n    max_length=None,\n    pad_to_multiple_of=None,\n    label_pad_token_id=-100,\n    position_pad_token_id=0,\n    return_tensors='pt',\n)\nData collator that will dynamically pad the inputs received, as well as the labels and position_ids\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntokenizer\n[PreTrainedTokenizer] or [PreTrainedTokenizerFast]\nThe tokenizer used for encoding the data.\nrequired\n\n\nmodel\n[PreTrainedModel]\nThe model that is being trained. If set and has the prepare_decoder_input_ids_from_labels, use it to prepare the decoder_input_ids This is useful when using label_smoothing to avoid calculating loss twice.\nNone\n\n\npadding\nbool, str or [~utils.PaddingStrategy], optional, defaults to True\nSelect a strategy to pad the returned sequences (according to the model’s padding side and padding index) among: - True or 'longest' (default): Pad to the longest sequence in the batch (or no padding if only a single sequence is provided). 
- 'max_length': Pad to a maximum length specified with the argument max_length or to the maximum acceptable input length for the model if that argument is not provided. - False or 'do_not_pad': No padding (i.e., can output a batch with sequences of different lengths).\nTrue\n\n\nmax_length\nint, optional\nMaximum length of the returned list and optionally padding length (see above).\nNone\n\n\npad_to_multiple_of\nint, optional\nIf set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability &gt;= 7.5 (Volta).\nNone\n\n\nlabel_pad_token_id\nint, optional, defaults to -100\nThe id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).\n-100\n\n\nreturn_tensors\nstr\nThe type of Tensor to return. Allowable values are “np”, “pt” and “tf”.\n'pt'\n\n\n\n\n\n\n\nutils.collators.batching.PretrainingBatchSamplerDataCollatorForSeq2Seq(\n    *args,\n    multipack_attn=True,\n    **kwargs,\n)\nCollator for multipack specific to the using the BatchSampler\n\n\n\nutils.collators.batching.V2BatchSamplerDataCollatorForSeq2Seq(\n    tokenizer,\n    model=None,\n    padding=True,\n    max_length=None,\n    pad_to_multiple_of=None,\n    label_pad_token_id=-100,\n    position_pad_token_id=0,\n    return_tensors='pt',\n    squash_position_ids=False,\n)\nCollator for multipack specific to the using the BatchSampler"
+    "text": "Name\nDescription\n\n\n\n\ndo_cli\nConvert a patched HF format Llama4 model (with separated projections)\n\n\n\n\n\ncli.delinearize_llama4.do_cli(model, output)\nConvert a patched HF format Llama4 model (with separated projections)\nback to the original HF format (with fused projections).\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\nUnion[Path, str]\nPath to the patched HF model\nrequired\n\n\noutput\nUnion[Path, str]\nPath to save the converted model\nrequired"
   },
   {
-    "objectID": "docs/api/utils.model_shard_quant.html",
-    "href": "docs/api/utils.model_shard_quant.html",
-    "title": "utils.model_shard_quant",
+    "objectID": "docs/api/prompt_strategies.dpo.passthrough.html",
+    "href": "docs/api/prompt_strategies.dpo.passthrough.html",
+    "title": "prompt_strategies.dpo.passthrough",
     "section": "",
-    "text": "utils.model_shard_quant\nmodule to handle loading model on cpu/meta device for FSDP\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_and_quantize\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\n\n\n\n\n\nutils.model_shard_quant.load_and_quantize(\n    module,\n    name,\n    value,\n    device=None,\n    dtype=None,\n    skip_names=None,\n    to_cpu=False,\n    to_meta=False,\n    verbose=False,\n    quant_method='bnb',\n)\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\nQuantizes Params4bit on device then places on “cpu” if to_cpu=True or “meta” if to_meta=True."
+    "text": "prompt_strategies.dpo.passthrough\nprompt_strategies.dpo.passthrough\nDPO prompt strategies passthrough/zero-processing strategy"
   },
   {
-    "objectID": "docs/api/utils.model_shard_quant.html#functions",
-    "href": "docs/api/utils.model_shard_quant.html#functions",
-    "title": "utils.model_shard_quant",
+    "objectID": "docs/api/cli.utils.load.html",
+    "href": "docs/api/cli.utils.load.html",
+    "title": "cli.utils.load",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nload_and_quantize\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\n\n\n\n\n\nutils.model_shard_quant.load_and_quantize(\n    module,\n    name,\n    value,\n    device=None,\n    dtype=None,\n    skip_names=None,\n    to_cpu=False,\n    to_meta=False,\n    verbose=False,\n    quant_method='bnb',\n)\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\nQuantizes Params4bit on device then places on “cpu” if to_cpu=True or “meta” if to_meta=True."
+    "text": "cli.utils.load\nUtilities for model, tokenizer, etc. loading.\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_model_and_tokenizer\nHelper function for loading a model, tokenizer, and processor specified in the\n\n\n\n\n\ncli.utils.load.load_model_and_tokenizer(cfg, inference=False)\nHelper function for loading a model, tokenizer, and processor specified in the\ngiven axolotl config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ninference\nbool\nBoolean denoting inference mode.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PreTrainedModel, PreTrainedTokenizer | PreTrainedTokenizerFast | Any, ProcessorMixin | None]\nTuple of (PreTrainedModel, PreTrainedTokenizer, ProcessorMixin)."
   },
   {
-    "objectID": "docs/api/integrations.kd.trainer.html",
-    "href": "docs/api/integrations.kd.trainer.html",
-    "title": "integrations.kd.trainer",
+    "objectID": "docs/api/cli.utils.load.html#functions",
+    "href": "docs/api/cli.utils.load.html#functions",
+    "title": "cli.utils.load",
     "section": "",
-    "text": "integrations.kd.trainer\nKD trainer\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlKDTrainer\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer(*args, **kwargs)\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute_loss\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer.compute_loss(\n    model,\n    inputs,\n    return_outputs=False,\n    num_items_in_batch=None,\n)\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\nSubclass and override for custom behavior."
+    "text": "Name\nDescription\n\n\n\n\nload_model_and_tokenizer\nHelper function for loading a model, tokenizer, and processor specified in the\n\n\n\n\n\ncli.utils.load.load_model_and_tokenizer(cfg, inference=False)\nHelper function for loading a model, tokenizer, and processor specified in the\ngiven axolotl config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ninference\nbool\nBoolean denoting inference mode.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PreTrainedModel, PreTrainedTokenizer | PreTrainedTokenizerFast | Any, ProcessorMixin | None]\nTuple of (PreTrainedModel, PreTrainedTokenizer, ProcessorMixin)."
   },
   {
-    "objectID": "docs/api/integrations.kd.trainer.html#classes",
-    "href": "docs/api/integrations.kd.trainer.html#classes",
-    "title": "integrations.kd.trainer",
+    "objectID": "docs/api/kernels.swiglu.html",
+    "href": "docs/api/kernels.swiglu.html",
+    "title": "kernels.swiglu",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nAxolotlKDTrainer\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer(*args, **kwargs)\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute_loss\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer.compute_loss(\n    model,\n    inputs,\n    return_outputs=False,\n    num_items_in_batch=None,\n)\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\nSubclass and override for custom behavior."
+    "text": "kernels.swiglu\nModule for definition of SwiGLU Triton kernels.\nSee “GLU Variants Improve Transformer” (https://arxiv.org/abs/2002.05202).\nCredit to unsloth (https://unsloth.ai/) for inspiration for this implementation.\n\n\n\n\n\nName\nDescription\n\n\n\n\nswiglu_backward\nSwiGLU backward pass using in-place operations.\n\n\nswiglu_forward\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\n\n\n\n\n\nkernels.swiglu.swiglu_backward(grad_output, gate, up)\nSwiGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - Forward pass output (h) - Gradient with respect to gate (df) - Gradient with respect to up-projection (de)\n\n\n\n\n\n\n\nkernels.swiglu.swiglu_forward(gate, up)\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\nx is the gate tensor.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput tensor of shape [batch, seq_len, hidden_dim]."
   },
   {
-    "objectID": "docs/api/utils.schemas.peft.html",
-    "href": "docs/api/utils.schemas.peft.html",
-    "title": "utils.schemas.peft",
+    "objectID": "docs/api/kernels.swiglu.html#functions",
+    "href": "docs/api/kernels.swiglu.html#functions",
+    "title": "kernels.swiglu",
     "section": "",
-    "text": "utils.schemas.peft\nPydantic models for PEFT-related configuration\n\n\n\n\n\nName\nDescription\n\n\n\n\nLoftQConfig\nLoftQ configuration subset\n\n\nLoraConfig\nPeft / LoRA configuration subset\n\n\nPeftConfig\npeftq configuration subset\n\n\nReLoRAConfig\nReLoRA configuration subset\n\n\n\n\n\nutils.schemas.peft.LoftQConfig()\nLoftQ configuration subset\n\n\n\nutils.schemas.peft.LoraConfig()\nPeft / LoRA configuration subset\n\n\n\nutils.schemas.peft.PeftConfig()\npeftq configuration subset\n\n\n\nutils.schemas.peft.ReLoRAConfig()\nReLoRA configuration subset"
+    "text": "Name\nDescription\n\n\n\n\nswiglu_backward\nSwiGLU backward pass using in-place operations.\n\n\nswiglu_forward\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\n\n\n\n\n\nkernels.swiglu.swiglu_backward(grad_output, gate, up)\nSwiGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - Forward pass output (h) - Gradient with respect to gate (df) - Gradient with respect to up-projection (de)\n\n\n\n\n\n\n\nkernels.swiglu.swiglu_forward(gate, up)\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\nx is the gate tensor.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput tensor of shape [batch, seq_len, hidden_dim]."
   },
   {
-    "objectID": "docs/api/utils.schemas.peft.html#classes",
-    "href": "docs/api/utils.schemas.peft.html#classes",
-    "title": "utils.schemas.peft",
+    "objectID": "docs/api/core.trainers.grpo.trainer.html",
+    "href": "docs/api/core.trainers.grpo.trainer.html",
+    "title": "core.trainers.grpo.trainer",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nLoftQConfig\nLoftQ configuration subset\n\n\nLoraConfig\nPeft / LoRA configuration subset\n\n\nPeftConfig\npeftq configuration subset\n\n\nReLoRAConfig\nReLoRA configuration subset\n\n\n\n\n\nutils.schemas.peft.LoftQConfig()\nLoftQ configuration subset\n\n\n\nutils.schemas.peft.LoraConfig()\nPeft / LoRA configuration subset\n\n\n\nutils.schemas.peft.PeftConfig()\npeftq configuration subset\n\n\n\nutils.schemas.peft.ReLoRAConfig()\nReLoRA configuration subset"
+    "text": "core.trainers.grpo.trainer\nAxolotl GRPO trainers (with and without sequence parallelism handling)\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlAsyncGRPOTrainer\nExtend AsyncGRPOTrainer with axolotl helpers\n\n\nAxolotlGRPOSequenceParallelTrainer\nExtend the base GRPOTrainer for sequence parallelism handling\n\n\nAxolotlGRPOTrainer\nExtend the base GRPOTrainer for axolotl helpers\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlAsyncGRPOTrainer(*args, **kwargs)\nExtend AsyncGRPOTrainer with axolotl helpers\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer(\n    model,\n    reward_funcs,\n    args=None,\n    train_dataset=None,\n    eval_dataset=None,\n    processing_class=None,\n    reward_processing_classes=None,\n    callbacks=None,\n    optimizers=(None, None),\n    peft_config=None,\n    optimizer_cls_and_kwargs=None,\n)\nExtend the base GRPOTrainer for sequence parallelism handling\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_train_dataloader\nGet dataloader for training\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer.get_train_dataloader(\n)\nGet dataloader for training\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOTrainer(*args, **kwargs)\nExtend the base GRPOTrainer for axolotl helpers"
   },
   {
-    "objectID": "docs/api/utils.ctx_managers.sequence_parallel.html",
-    "href": "docs/api/utils.ctx_managers.sequence_parallel.html",
-    "title": "utils.ctx_managers.sequence_parallel",
+    "objectID": "docs/api/core.trainers.grpo.trainer.html#classes",
+    "href": "docs/api/core.trainers.grpo.trainer.html#classes",
+    "title": "core.trainers.grpo.trainer",
     "section": "",
-    "text": "utils.ctx_managers.sequence_parallel\nModule for Axolotl trainer sequence parallelism manager and utilities\n\n\n\n\n\nName\nDescription\n\n\n\n\nAllGatherWithGrad\nCustom autograd function for all-gather to preserve gradients.\n\n\nSequenceParallelContextManager\nContext manager for sequence parallelism operations.\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.AllGatherWithGrad()\nCustom autograd function for all-gather to preserve gradients.\n\n\n\n\n\nName\nDescription\n\n\n\n\nbackward\nBackward pass for all-gather operation.\n\n\nforward\nForward pass of all-gather of data with sequence dimension.\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.AllGatherWithGrad.backward(\n    ctx,\n    grad_output,\n)\nBackward pass for all-gather operation.\nExtracts the gradient slice corresponding to this rank’s original input\nfrom the full gradient tensor.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\ntorch.autograd function context.\nrequired\n\n\ngrad_output\ntorch.Tensor\nGradient from subsequent layers with respect to the concatenated output tensor.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, None]\nTuple containing the gradient slice for this rank’s input tensor and None for the process group parameter which doesn’t require gradients.\n\n\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.AllGatherWithGrad.forward(\n    ctx,\n    input_tensor,\n    group,\n)\nForward pass of all-gather of data with sequence dimension.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\ntorch.autograd function context.\nrequired\n\n\ninput_tensor\ntorch.Tensor\nTensor from model output with sequence dimension.\nrequired\n\n\ngroup\ndist.ProcessGroup\ntorch.distributed process group.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nTensor from gathering the input_tensor 
from across the process group and concatenating along the sequence dimension.\n\n\n\n\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.SequenceParallelContextManager(\n    models,\n    context_parallel_size,\n    gradient_accumulation_steps,\n    ring_attn_func,\n    heads_k_stride,\n    gather_outputs,\n    device_mesh=None,\n)\nContext manager for sequence parallelism operations.\nThis class provides a context that will automatically apply sequence parallelism\nduring model forward passes using a pre-forward hook, and gather outputs from\nacross the sequence parallelism group using a post-forward hook.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodels\nlist[nn.Module]\nList of models to apply sequence parallelism to pre- and post- forward hooks.\nrequired\n\n\ncontext_parallel_size\nint\nNumber of processes to split sequences over.\nrequired\n\n\ngradient_accumulation_steps\nint\nNumber of steps to accumulate gradients over.\nrequired\n\n\nring_attn_func\nRingAttnFunc\nWhich ring attention function to use. Currently unused.\nrequired\n\n\nheads_k_stride\nint | None\nSequence parallelism K head stride size. 
Passed through to varlen_llama3 ring_flash_attn implementation.\nrequired\n\n\ngather_outputs\nbool\nWhether to gather outputs after model forward pass across the sequence parallel group.\nrequired\n\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\napply_sequence_parallelism\nApply sequence parallelism slicing to a batch.\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.apply_sequence_parallelism(\n    batch,\n    local_rank,\n    local_world_size,\n    gradient_accumulation_steps,\n    ring_attn_func,\n)\nApply sequence parallelism slicing to a batch.\nSpecial handling is implemented for integer logits_to_keep, which indicates\nto only keep the last N tokens in the sequence during generation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbatch\ndict[str, torch.Tensor]\nBatch dictionary (e.g., input_ids, attention_mask, etc.).\nrequired\n\n\nlocal_rank\nint\nLocal rank in the sequence parallel group.\nrequired\n\n\nlocal_world_size\nint\nWorld size of the sequence parallel group.\nrequired\n\n\ngradient_accumulation_steps\nint\nNumber of steps to accumulate gradients over.\nrequired\n\n\nring_attn_func\nRingAttnFunc\nWhich ring attention function to use. Currently unused, but related to above TODO.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[dict[str, torch.Tensor], int, int]\ntuple of: - Batch dictionary with sliced tensors. - The original sequence length before padding. - The number of padding tokens added."
+    "text": "Name\nDescription\n\n\n\n\nAxolotlAsyncGRPOTrainer\nExtend AsyncGRPOTrainer with axolotl helpers\n\n\nAxolotlGRPOSequenceParallelTrainer\nExtend the base GRPOTrainer for sequence parallelism handling\n\n\nAxolotlGRPOTrainer\nExtend the base GRPOTrainer for axolotl helpers\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlAsyncGRPOTrainer(*args, **kwargs)\nExtend AsyncGRPOTrainer with axolotl helpers\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer(\n    model,\n    reward_funcs,\n    args=None,\n    train_dataset=None,\n    eval_dataset=None,\n    processing_class=None,\n    reward_processing_classes=None,\n    callbacks=None,\n    optimizers=(None, None),\n    peft_config=None,\n    optimizer_cls_and_kwargs=None,\n)\nExtend the base GRPOTrainer for sequence parallelism handling\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_train_dataloader\nGet dataloader for training\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer.get_train_dataloader(\n)\nGet dataloader for training\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOTrainer(*args, **kwargs)\nExtend the base GRPOTrainer for axolotl helpers"
   },
   {
-    "objectID": "docs/api/utils.ctx_managers.sequence_parallel.html#classes",
-    "href": "docs/api/utils.ctx_managers.sequence_parallel.html#classes",
-    "title": "utils.ctx_managers.sequence_parallel",
+    "objectID": "docs/api/common.const.html",
+    "href": "docs/api/common.const.html",
+    "title": "common.const",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nAllGatherWithGrad\nCustom autograd function for all-gather to preserve gradients.\n\n\nSequenceParallelContextManager\nContext manager for sequence parallelism operations.\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.AllGatherWithGrad()\nCustom autograd function for all-gather to preserve gradients.\n\n\n\n\n\nName\nDescription\n\n\n\n\nbackward\nBackward pass for all-gather operation.\n\n\nforward\nForward pass of all-gather of data with sequence dimension.\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.AllGatherWithGrad.backward(\n    ctx,\n    grad_output,\n)\nBackward pass for all-gather operation.\nExtracts the gradient slice corresponding to this rank’s original input\nfrom the full gradient tensor.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\ntorch.autograd function context.\nrequired\n\n\ngrad_output\ntorch.Tensor\nGradient from subsequent layers with respect to the concatenated output tensor.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, None]\nTuple containing the gradient slice for this rank’s input tensor and None for the process group parameter which doesn’t require gradients.\n\n\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.AllGatherWithGrad.forward(\n    ctx,\n    input_tensor,\n    group,\n)\nForward pass of all-gather of data with sequence dimension.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\ntorch.autograd function context.\nrequired\n\n\ninput_tensor\ntorch.Tensor\nTensor from model output with sequence dimension.\nrequired\n\n\ngroup\ndist.ProcessGroup\ntorch.distributed process group.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nTensor from gathering the input_tensor from across the process group and concatenating along the sequence 
dimension.\n\n\n\n\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.SequenceParallelContextManager(\n    models,\n    context_parallel_size,\n    gradient_accumulation_steps,\n    ring_attn_func,\n    heads_k_stride,\n    gather_outputs,\n    device_mesh=None,\n)\nContext manager for sequence parallelism operations.\nThis class provides a context that will automatically apply sequence parallelism\nduring model forward passes using a pre-forward hook, and gather outputs from\nacross the sequence parallelism group using a post-forward hook.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodels\nlist[nn.Module]\nList of models to apply sequence parallelism to pre- and post- forward hooks.\nrequired\n\n\ncontext_parallel_size\nint\nNumber of processes to split sequences over.\nrequired\n\n\ngradient_accumulation_steps\nint\nNumber of steps to accumulate gradients over.\nrequired\n\n\nring_attn_func\nRingAttnFunc\nWhich ring attention function to use. Currently unused.\nrequired\n\n\nheads_k_stride\nint | None\nSequence parallelism K head stride size. Passed through to varlen_llama3 ring_flash_attn implementation.\nrequired\n\n\ngather_outputs\nbool\nWhether to gather outputs after model forward pass across the sequence parallel group.\nrequired"
+    "text": "common.const\ncommon.const\nVarious shared constants"
   },
   {
-    "objectID": "docs/api/utils.ctx_managers.sequence_parallel.html#functions",
-    "href": "docs/api/utils.ctx_managers.sequence_parallel.html#functions",
-    "title": "utils.ctx_managers.sequence_parallel",
+    "objectID": "docs/api/datasets.html",
+    "href": "docs/api/datasets.html",
+    "title": "datasets",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\napply_sequence_parallelism\nApply sequence parallelism slicing to a batch.\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.apply_sequence_parallelism(\n    batch,\n    local_rank,\n    local_world_size,\n    gradient_accumulation_steps,\n    ring_attn_func,\n)\nApply sequence parallelism slicing to a batch.\nSpecial handling is implemented for integer logits_to_keep, which indicates\nto only keep the last N tokens in the sequence during generation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbatch\ndict[str, torch.Tensor]\nBatch dictionary (e.g., input_ids, attention_mask, etc.).\nrequired\n\n\nlocal_rank\nint\nLocal rank in the sequence parallel group.\nrequired\n\n\nlocal_world_size\nint\nWorld size of the sequence parallel group.\nrequired\n\n\ngradient_accumulation_steps\nint\nNumber of steps to accumulate gradients over.\nrequired\n\n\nring_attn_func\nRingAttnFunc\nWhich ring attention function to use. Currently unused, but related to above TODO.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[dict[str, torch.Tensor], int, int]\ntuple of: - Batch dictionary with sliced tensors. - The original sequence length before padding. - The number of padding tokens added."
+    "text": "datasets\nModule containing dataset functionality.\nWe want this to be a wrapper for an existing dataset that we have loaded. Lets use the\nconcept of middlewares to wrap each dataset. We’ll use the collators later on to pad the\ndatasets.\n\n\n\n\n\nName\nDescription\n\n\n\n\nTokenizedPromptDataset\nDataset that returns tokenized prompts from a stream of text files.\n\n\n\n\n\ndatasets.TokenizedPromptDataset(\n    prompt_tokenizer,\n    dataset,\n    process_count=None,\n    keep_in_memory=False,\n    **kwargs,\n)\nDataset that returns tokenized prompts from a stream of text files.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nprompt_tokenizer\nPromptTokenizingStrategy\nThe prompt tokenizing method for processing the data.\nrequired\n\n\ndataset\nDataset\nDataset with text files.\nrequired\n\n\nprocess_count\nint | None\nNumber of processes to use for tokenizing.\nNone\n\n\nkeep_in_memory\nbool | None\nWhether to keep the tokenized dataset in memory.\nFalse"
   },
   {
-    "objectID": "docs/api/cli.vllm_serve.html",
-    "href": "docs/api/cli.vllm_serve.html",
-    "title": "cli.vllm_serve",
+    "objectID": "docs/api/datasets.html#classes",
+    "href": "docs/api/datasets.html#classes",
+    "title": "datasets",
     "section": "",
-    "text": "cli.vllm_serve\nCLI to start the vllm server for online RL\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlScriptArguments\nAdditional arguments for the VLLM server\n\n\n\n\n\ncli.vllm_serve.AxolotlScriptArguments(\n    reasoning_parser='',\n    enable_reasoning=None,\n)\nAdditional arguments for the VLLM server\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_vllm_serve\nStarts the VLLM server for serving LLM models used for online RL\n\n\n\n\n\ncli.vllm_serve.do_vllm_serve(config, cli_args)\nStarts the VLLM server for serving LLM models used for online RL\nArgs\n:param cfg: Parsed doct of the YAML config\n:param cli_args: dict of additional command-line arguments of type VllmServeCliArgs\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nprocess_id\n\nthe process id of the started VLLM server"
+    "text": "Name\nDescription\n\n\n\n\nTokenizedPromptDataset\nDataset that returns tokenized prompts from a stream of text files.\n\n\n\n\n\ndatasets.TokenizedPromptDataset(\n    prompt_tokenizer,\n    dataset,\n    process_count=None,\n    keep_in_memory=False,\n    **kwargs,\n)\nDataset that returns tokenized prompts from a stream of text files.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nprompt_tokenizer\nPromptTokenizingStrategy\nThe prompt tokenizing method for processing the data.\nrequired\n\n\ndataset\nDataset\nDataset with text files.\nrequired\n\n\nprocess_count\nint | None\nNumber of processes to use for tokenizing.\nNone\n\n\nkeep_in_memory\nbool | None\nWhether to keep the tokenized dataset in memory.\nFalse"
   },
   {
-    "objectID": "docs/api/cli.vllm_serve.html#classes",
-    "href": "docs/api/cli.vllm_serve.html#classes",
-    "title": "cli.vllm_serve",
+    "objectID": "docs/api/kernels.lora.html",
+    "href": "docs/api/kernels.lora.html",
+    "title": "kernels.lora",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nAxolotlScriptArguments\nAdditional arguments for the VLLM server\n\n\n\n\n\ncli.vllm_serve.AxolotlScriptArguments(\n    reasoning_parser='',\n    enable_reasoning=None,\n)\nAdditional arguments for the VLLM server"
+    "text": "kernels.lora\nModule for definition of Low-Rank Adaptation (LoRA) Triton kernels.\nSee “LoRA: Low-Rank Adaptation of Large Language Models”\n(https://arxiv.org/abs/2106.09685).\nAlso supports DoRA (Weight-Decomposed Low-Rank Adaptation):\nSee “DoRA: Weight-Decomposed Low-Rank Adaptation” (https://arxiv.org/abs/2402.09353).\nCredit to unsloth (https://unsloth.ai/) for inspiration for this implementation.\n\n\n\n\n\nName\nDescription\n\n\n\n\nLoRA_Embedding\nFused LoRA embedding: F.embedding(x, W) + s * F.embedding(x, A^T) @ B^T.\n\n\nLoRA_MLP\nOptimized LoRA MLP implementation.\n\n\nLoRA_O\nOptimized LoRA implementation for output projection.\n\n\nLoRA_QK\nOptimized LoRA QK implementation for models where v_proj is None.\n\n\nLoRA_QKV\nOptimized LoRA QKV implementation with quantization support.\n\n\n\n\n\nkernels.lora.LoRA_Embedding()\nFused LoRA embedding: F.embedding(x, W) + s * F.embedding(x, A^T) @ B^T.\nSupports dropout and DoRA.\n\n\n\nkernels.lora.LoRA_MLP()\nOptimized LoRA MLP implementation.\nSupports bias, dropout, and DoRA. Dropout is applied to the input for\ngate/up projections. The down projection uses hidden states (post-activation)\nas input, so dropout is not applied there.\n\n\n\nkernels.lora.LoRA_O()\nOptimized LoRA implementation for output projection.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.LoRA_QK()\nOptimized LoRA QK implementation for models where v_proj is None.\nUsed by models like Gemma4 with attention_k_eq_v=True, where key states are\nreused as value states. 
Only Q and K projections are fused; the caller\nreturns K a second time as V so that autograd accumulates key+value gradients\ninto a single dK.\nSupports bias, dropout, and DoRA (Weight-Decomposed Low-Rank Adaptation).\n\n\n\nkernels.lora.LoRA_QKV()\nOptimized LoRA QKV implementation with quantization support.\nSupports bias, dropout, and DoRA (Weight-Decomposed Low-Rank Adaptation).\nDropout is applied outside this Function so autograd handles its backward.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\napply_lora_embedding\nApplies LoRA to embedding layer.\n\n\napply_lora_mlp_geglu\nApplies LoRA to MLP layer with GEGLU activation.\n\n\napply_lora_mlp_swiglu\nApplies LoRA to MLP layer with SwiGLU activation.\n\n\napply_lora_o\nApplies LoRA to output projection layer.\n\n\napply_lora_qk\nApplies LoRA to compute Query and Key projections for models where v_proj is None.\n\n\napply_lora_qkv\nApplies LoRA to compute Query, Key, Value projections.\n\n\nget_embedding_lora_parameters\nExtract LoRA parameters from a PEFT Embedding module.\n\n\nget_lora_parameters\nGets LoRA parameters from a projection module.\n\n\nmatmul_lora\nEfficient fused matmul + LoRA computation.\n\n\n\n\n\nkernels.lora.apply_lora_embedding(self, x)\nApplies LoRA to embedding layer.\n\n\n\nkernels.lora.apply_lora_mlp_geglu(self, X, inplace=True)\nApplies LoRA to MLP layer with GEGLU activation.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.apply_lora_mlp_swiglu(self, X, inplace=True)\nApplies LoRA to MLP layer with SwiGLU activation.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.apply_lora_o(self, X)\nApplies LoRA to output projection layer.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.apply_lora_qk(self, X, inplace=True)\nApplies LoRA to compute Query and Key projections for models where v_proj is None.\nWhen v_proj is None (e.g. Gemma4 attention_k_eq_v), key states are reused as\nvalue states. 
Returns (Q, K, K) — the caller’s patched forward will use K as V.\nBecause K is returned twice, autograd accumulates gradients from both the key and\nvalue paths into dK before calling LoRA_QK.backward.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.apply_lora_qkv(self, X, inplace=True)\nApplies LoRA to compute Query, Key, Value projections.\nSupports bias, dropout, and DoRA. Dropout is applied outside the autograd\nFunction so PyTorch handles its backward automatically. A single shared\ndropout mask is used across Q, K, V projections for memory efficiency.\n\n\n\nkernels.lora.get_embedding_lora_parameters(embed)\nExtract LoRA parameters from a PEFT Embedding module.\n\n\n\nkernels.lora.get_lora_parameters(proj)\nGets LoRA parameters from a projection module.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nproj\nnn.Module\nThe projection module to extract parameters from.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nA tuple containing:\n\n\n\ntorch.Tensor | None\n- W: base weight tensor\n\n\n\nQuantState | torch.Tensor | None\n- b: base layer bias (or None)\n\n\n\ntorch.Tensor | None\n- quant_state: quantization state (or None)\n\n\n\ntorch.Tensor | None\n- A: LoRA A weight (or None)\n\n\n\nfloat | None\n- B: LoRA B weight (or None)\n\n\n\ntorch.Tensor | None\n- s: LoRA scaling factor (or None)\n\n\n\nnn.Module | None\n- lora_bias: LoRA B bias (or None)\n\n\n\ntorch.Tensor | None\n- dropout: dropout module (or None)\n\n\n\ntuple[torch.Tensor, torch.Tensor | None, QuantState | torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, float | None, torch.Tensor | None, nn.Module | None, torch.Tensor | None]\n- magnitude: DoRA magnitude vector (or None)\n\n\n\n\n\n\n\nkernels.lora.matmul_lora(\n    X,\n    W,\n    b,\n    W_quant,\n    A,\n    B,\n    s,\n    out=None,\n    X_drop=None,\n    lora_bias=None,\n)\nEfficient fused matmul + LoRA 
computation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\ntorch.Tensor\nInput tensor [*, in_features]\nrequired\n\n\nW\ntorch.Tensor\nBase weight matrix [out_features, in_features]\nrequired\n\n\nW_quant\nQuantState | torch.Tensor | None\nQuantization state for W\nrequired\n\n\nA\ntorch.Tensor | None\nLoRA A matrix [rank, in_features]\nrequired\n\n\nB\ntorch.Tensor | None\nLoRA B matrix [out_features, rank]\nrequired\n\n\ns\nfloat | None\nLoRA scaling factor\nrequired\n\n\nout\ntorch.Tensor | None\nOptional output tensor for inplace operations\nNone\n\n\nX_drop\ntorch.Tensor | None\nOptional dropout-applied input for LoRA path (if None, uses X)\nNone\n\n\nlora_bias\ntorch.Tensor | None\nOptional LoRA B layer bias [out_features]\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nResult of X @ W + s * X_drop @ A @ B + b + s * lora_bias"
   },
   {
-    "objectID": "docs/api/cli.vllm_serve.html#functions",
-    "href": "docs/api/cli.vllm_serve.html#functions",
-    "title": "cli.vllm_serve",
+    "objectID": "docs/api/kernels.lora.html#classes",
+    "href": "docs/api/kernels.lora.html#classes",
+    "title": "kernels.lora",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ndo_vllm_serve\nStarts the VLLM server for serving LLM models used for online RL\n\n\n\n\n\ncli.vllm_serve.do_vllm_serve(config, cli_args)\nStarts the VLLM server for serving LLM models used for online RL\nArgs\n:param cfg: Parsed doct of the YAML config\n:param cli_args: dict of additional command-line arguments of type VllmServeCliArgs\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nprocess_id\n\nthe process id of the started VLLM server"
+    "text": "Name\nDescription\n\n\n\n\nLoRA_Embedding\nFused LoRA embedding: F.embedding(x, W) + s * F.embedding(x, A^T) @ B^T.\n\n\nLoRA_MLP\nOptimized LoRA MLP implementation.\n\n\nLoRA_O\nOptimized LoRA implementation for output projection.\n\n\nLoRA_QK\nOptimized LoRA QK implementation for models where v_proj is None.\n\n\nLoRA_QKV\nOptimized LoRA QKV implementation with quantization support.\n\n\n\n\n\nkernels.lora.LoRA_Embedding()\nFused LoRA embedding: F.embedding(x, W) + s * F.embedding(x, A^T) @ B^T.\nSupports dropout and DoRA.\n\n\n\nkernels.lora.LoRA_MLP()\nOptimized LoRA MLP implementation.\nSupports bias, dropout, and DoRA. Dropout is applied to the input for\ngate/up projections. The down projection uses hidden states (post-activation)\nas input, so dropout is not applied there.\n\n\n\nkernels.lora.LoRA_O()\nOptimized LoRA implementation for output projection.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.LoRA_QK()\nOptimized LoRA QK implementation for models where v_proj is None.\nUsed by models like Gemma4 with attention_k_eq_v=True, where key states are\nreused as value states. Only Q and K projections are fused; the caller\nreturns K a second time as V so that autograd accumulates key+value gradients\ninto a single dK.\nSupports bias, dropout, and DoRA (Weight-Decomposed Low-Rank Adaptation).\n\n\n\nkernels.lora.LoRA_QKV()\nOptimized LoRA QKV implementation with quantization support.\nSupports bias, dropout, and DoRA (Weight-Decomposed Low-Rank Adaptation).\nDropout is applied outside this Function so autograd handles its backward."
   },
   {
-    "objectID": "docs/api/utils.quantization.html",
-    "href": "docs/api/utils.quantization.html",
-    "title": "utils.quantization",
+    "objectID": "docs/api/kernels.lora.html#functions",
+    "href": "docs/api/kernels.lora.html#functions",
+    "title": "kernels.lora",
     "section": "",
-    "text": "utils.quantization\nUtilities for quantization including QAT and PTQ using torchao.\n\n\n\n\n\nName\nDescription\n\n\n\n\nconvert_qat_model\nThis function converts a QAT model which has fake quantized layers back to the original model.\n\n\nget_quantization_config\nThis function is used to build a post-training quantization config.\n\n\nprepare_model_for_qat\nThis function is used to prepare a model for QAT by swapping the model’s linear\n\n\nquantize_model\nThis function is used to quantize a model.\n\n\n\n\n\nutils.quantization.convert_qat_model(model, quantize_embedding=False)\nThis function converts a QAT model which has fake quantized layers back to the original model.\n\n\n\nutils.quantization.get_quantization_config(\n    weight_dtype,\n    activation_dtype=None,\n    group_size=None,\n)\nThis function is used to build a post-training quantization config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nweight_dtype\nTorchAOQuantDType\nThe dtype to use for weight quantization.\nrequired\n\n\nactivation_dtype\nTorchAOQuantDType | None\nThe dtype to use for activation quantization.\nNone\n\n\ngroup_size\nint | None\nThe group size to use for weight quantization.\nNone\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nAOBaseConfig\nThe post-training quantization config.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the activation dtype is not specified and the weight dtype is not int8 or int4, or if the group size is not specified for int8 or int4 weight only quantization.\n\n\n\n\n\n\n\nutils.quantization.prepare_model_for_qat(\n    model,\n    weight_dtype,\n    group_size=None,\n    activation_dtype=None,\n    quantize_embedding=False,\n)\nThis function is used to prepare a model for QAT by swapping the model’s linear\nlayers with fake quantized linear layers, and optionally the embedding weights with\nfake quantized embedding 
weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\n\nThe model to quantize.\nrequired\n\n\nweight_dtype\nTorchAOQuantDType\nThe dtype to use for weight quantization.\nrequired\n\n\ngroup_size\nint | None\nThe group size to use for weight quantization.\nNone\n\n\nactivation_dtype\nTorchAOQuantDType | None\nThe dtype to use for activation quantization.\nNone\n\n\nquantize_embedding\nbool\nWhether to quantize the model’s embedding weights.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the activation/weight dtype combination is invalid.\n\n\n\n\n\n\n\nutils.quantization.quantize_model(\n    model,\n    weight_dtype,\n    group_size=None,\n    activation_dtype=None,\n    quantize_embedding=None,\n)\nThis function is used to quantize a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\n\nThe model to quantize.\nrequired\n\n\nweight_dtype\nTorchAOQuantDType\nThe dtype to use for weight quantization.\nrequired\n\n\ngroup_size\nint | None\nThe group size to use for weight quantization.\nNone\n\n\nactivation_dtype\nTorchAOQuantDType | None\nThe dtype to use for activation quantization.\nNone\n\n\nquantize_embedding\nbool | None\nWhether to quantize the model’s embedding weights.\nNone"
+    "text": "Name\nDescription\n\n\n\n\napply_lora_embedding\nApplies LoRA to embedding layer.\n\n\napply_lora_mlp_geglu\nApplies LoRA to MLP layer with GEGLU activation.\n\n\napply_lora_mlp_swiglu\nApplies LoRA to MLP layer with SwiGLU activation.\n\n\napply_lora_o\nApplies LoRA to output projection layer.\n\n\napply_lora_qk\nApplies LoRA to compute Query and Key projections for models where v_proj is None.\n\n\napply_lora_qkv\nApplies LoRA to compute Query, Key, Value projections.\n\n\nget_embedding_lora_parameters\nExtract LoRA parameters from a PEFT Embedding module.\n\n\nget_lora_parameters\nGets LoRA parameters from a projection module.\n\n\nmatmul_lora\nEfficient fused matmul + LoRA computation.\n\n\n\n\n\nkernels.lora.apply_lora_embedding(self, x)\nApplies LoRA to embedding layer.\n\n\n\nkernels.lora.apply_lora_mlp_geglu(self, X, inplace=True)\nApplies LoRA to MLP layer with GEGLU activation.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.apply_lora_mlp_swiglu(self, X, inplace=True)\nApplies LoRA to MLP layer with SwiGLU activation.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.apply_lora_o(self, X)\nApplies LoRA to output projection layer.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.apply_lora_qk(self, X, inplace=True)\nApplies LoRA to compute Query and Key projections for models where v_proj is None.\nWhen v_proj is None (e.g. Gemma4 attention_k_eq_v), key states are reused as\nvalue states. Returns (Q, K, K) — the caller’s patched forward will use K as V.\nBecause K is returned twice, autograd accumulates gradients from both the key and\nvalue paths into dK before calling LoRA_QK.backward.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.apply_lora_qkv(self, X, inplace=True)\nApplies LoRA to compute Query, Key, Value projections.\nSupports bias, dropout, and DoRA. Dropout is applied outside the autograd\nFunction so PyTorch handles its backward automatically. 
A single shared\ndropout mask is used across Q, K, V projections for memory efficiency.\n\n\n\nkernels.lora.get_embedding_lora_parameters(embed)\nExtract LoRA parameters from a PEFT Embedding module.\n\n\n\nkernels.lora.get_lora_parameters(proj)\nGets LoRA parameters from a projection module.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nproj\nnn.Module\nThe projection module to extract parameters from.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nA tuple containing:\n\n\n\ntorch.Tensor | None\n- W: base weight tensor\n\n\n\nQuantState | torch.Tensor | None\n- b: base layer bias (or None)\n\n\n\ntorch.Tensor | None\n- quant_state: quantization state (or None)\n\n\n\ntorch.Tensor | None\n- A: LoRA A weight (or None)\n\n\n\nfloat | None\n- B: LoRA B weight (or None)\n\n\n\ntorch.Tensor | None\n- s: LoRA scaling factor (or None)\n\n\n\nnn.Module | None\n- lora_bias: LoRA B bias (or None)\n\n\n\ntorch.Tensor | None\n- dropout: dropout module (or None)\n\n\n\ntuple[torch.Tensor, torch.Tensor | None, QuantState | torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, float | None, torch.Tensor | None, nn.Module | None, torch.Tensor | None]\n- magnitude: DoRA magnitude vector (or None)\n\n\n\n\n\n\n\nkernels.lora.matmul_lora(\n    X,\n    W,\n    b,\n    W_quant,\n    A,\n    B,\n    s,\n    out=None,\n    X_drop=None,\n    lora_bias=None,\n)\nEfficient fused matmul + LoRA computation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\ntorch.Tensor\nInput tensor [*, in_features]\nrequired\n\n\nW\ntorch.Tensor\nBase weight matrix [out_features, in_features]\nrequired\n\n\nW_quant\nQuantState | torch.Tensor | None\nQuantization state for W\nrequired\n\n\nA\ntorch.Tensor | None\nLoRA A matrix [rank, in_features]\nrequired\n\n\nB\ntorch.Tensor | None\nLoRA B matrix [out_features, rank]\nrequired\n\n\ns\nfloat | None\nLoRA scaling factor\nrequired\n\n\nout\ntorch.Tensor | None\nOptional 
output tensor for inplace operations\nNone\n\n\nX_drop\ntorch.Tensor | None\nOptional dropout-applied input for LoRA path (if None, uses X)\nNone\n\n\nlora_bias\ntorch.Tensor | None\nOptional LoRA B layer bias [out_features]\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nResult of X @ W + s * X_drop @ A @ B + b + s * lora_bias"
   },
   {
-    "objectID": "docs/api/utils.quantization.html#functions",
-    "href": "docs/api/utils.quantization.html#functions",
-    "title": "utils.quantization",
+    "objectID": "docs/api/monkeypatch.data.batch_dataset_fetcher.html",
+    "href": "docs/api/monkeypatch.data.batch_dataset_fetcher.html",
+    "title": "monkeypatch.data.batch_dataset_fetcher",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nconvert_qat_model\nThis function converts a QAT model which has fake quantized layers back to the original model.\n\n\nget_quantization_config\nThis function is used to build a post-training quantization config.\n\n\nprepare_model_for_qat\nThis function is used to prepare a model for QAT by swapping the model’s linear\n\n\nquantize_model\nThis function is used to quantize a model.\n\n\n\n\n\nutils.quantization.convert_qat_model(model, quantize_embedding=False)\nThis function converts a QAT model which has fake quantized layers back to the original model.\n\n\n\nutils.quantization.get_quantization_config(\n    weight_dtype,\n    activation_dtype=None,\n    group_size=None,\n)\nThis function is used to build a post-training quantization config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nweight_dtype\nTorchAOQuantDType\nThe dtype to use for weight quantization.\nrequired\n\n\nactivation_dtype\nTorchAOQuantDType | None\nThe dtype to use for activation quantization.\nNone\n\n\ngroup_size\nint | None\nThe group size to use for weight quantization.\nNone\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nAOBaseConfig\nThe post-training quantization config.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the activation dtype is not specified and the weight dtype is not int8 or int4, or if the group size is not specified for int8 or int4 weight only quantization.\n\n\n\n\n\n\n\nutils.quantization.prepare_model_for_qat(\n    model,\n    weight_dtype,\n    group_size=None,\n    activation_dtype=None,\n    quantize_embedding=False,\n)\nThis function is used to prepare a model for QAT by swapping the model’s linear\nlayers with fake quantized linear layers, and optionally the embedding weights with\nfake quantized embedding weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\n\nThe model to quantize.\nrequired\n\n\nweight_dtype\nTorchAOQuantDType\nThe 
dtype to use for weight quantization.\nrequired\n\n\ngroup_size\nint | None\nThe group size to use for weight quantization.\nNone\n\n\nactivation_dtype\nTorchAOQuantDType | None\nThe dtype to use for activation quantization.\nNone\n\n\nquantize_embedding\nbool\nWhether to quantize the model’s embedding weights.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the activation/weight dtype combination is invalid.\n\n\n\n\n\n\n\nutils.quantization.quantize_model(\n    model,\n    weight_dtype,\n    group_size=None,\n    activation_dtype=None,\n    quantize_embedding=None,\n)\nThis function is used to quantize a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\n\nThe model to quantize.\nrequired\n\n\nweight_dtype\nTorchAOQuantDType\nThe dtype to use for weight quantization.\nrequired\n\n\ngroup_size\nint | None\nThe group size to use for weight quantization.\nNone\n\n\nactivation_dtype\nTorchAOQuantDType | None\nThe dtype to use for activation quantization.\nNone\n\n\nquantize_embedding\nbool | None\nWhether to quantize the model’s embedding weights.\nNone"
+    "text": "monkeypatch.data.batch_dataset_fetcher\nMonkey patches for the dataset fetcher to handle batches of packed indexes.\n\n\n\n\n\nName\nDescription\n\n\n\n\napply_multipack_dataloader_patch\nThis patch allows DataLoader to correctly process batches that contain multiple bins\n\n\npatch_fetchers\nApply patches to PyTorch’s DataLoader components.\n\n\npatched_worker_loop\nWorker loop that ensures patches are applied in worker processes.\n\n\nremove_multipack_dataloader_patch\nRemove the monkeypatch and restore original PyTorch DataLoader behavior.\n\n\n\n\n\nmonkeypatch.data.batch_dataset_fetcher.apply_multipack_dataloader_patch()\nThis patch allows DataLoader to correctly process batches that contain multiple bins\nof packed sequences.\n\n\n\nmonkeypatch.data.batch_dataset_fetcher.patch_fetchers()\nApply patches to PyTorch’s DataLoader components.\n\n\n\nmonkeypatch.data.batch_dataset_fetcher.patched_worker_loop(*args, **kwargs)\nWorker loop that ensures patches are applied in worker processes.\n\n\n\nmonkeypatch.data.batch_dataset_fetcher.remove_multipack_dataloader_patch()\nRemove the monkeypatch and restore original PyTorch DataLoader behavior."
   },
   {
-    "objectID": "docs/api/utils.collators.mamba.html",
-    "href": "docs/api/utils.collators.mamba.html",
-    "title": "utils.collators.mamba",
+    "objectID": "docs/api/monkeypatch.data.batch_dataset_fetcher.html#functions",
+    "href": "docs/api/monkeypatch.data.batch_dataset_fetcher.html#functions",
+    "title": "monkeypatch.data.batch_dataset_fetcher",
     "section": "",
-    "text": "utils.collators.mamba\ncollators for Mamba\n\n\n\n\n\nName\nDescription\n\n\n\n\nMambaDataCollator\nCollator for State Space Models (Mamba)\n\n\n\n\n\nutils.collators.mamba.MambaDataCollator(tokenizer)\nCollator for State Space Models (Mamba)"
+    "text": "Name\nDescription\n\n\n\n\napply_multipack_dataloader_patch\nThis patch allows DataLoader to correctly process batches that contain multiple bins\n\n\npatch_fetchers\nApply patches to PyTorch’s DataLoader components.\n\n\npatched_worker_loop\nWorker loop that ensures patches are applied in worker processes.\n\n\nremove_multipack_dataloader_patch\nRemove the monkeypatch and restore original PyTorch DataLoader behavior.\n\n\n\n\n\nmonkeypatch.data.batch_dataset_fetcher.apply_multipack_dataloader_patch()\nThis patch allows DataLoader to correctly process batches that contain multiple bins\nof packed sequences.\n\n\n\nmonkeypatch.data.batch_dataset_fetcher.patch_fetchers()\nApply patches to PyTorch’s DataLoader components.\n\n\n\nmonkeypatch.data.batch_dataset_fetcher.patched_worker_loop(*args, **kwargs)\nWorker loop that ensures patches are applied in worker processes.\n\n\n\nmonkeypatch.data.batch_dataset_fetcher.remove_multipack_dataloader_patch()\nRemove the monkeypatch and restore original PyTorch DataLoader behavior."
   },
   {
-    "objectID": "docs/api/utils.collators.mamba.html#classes",
-    "href": "docs/api/utils.collators.mamba.html#classes",
-    "title": "utils.collators.mamba",
+    "objectID": "docs/api/loaders.constants.html",
+    "href": "docs/api/loaders.constants.html",
+    "title": "loaders.constants",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nMambaDataCollator\nCollator for State Space Models (Mamba)\n\n\n\n\n\nutils.collators.mamba.MambaDataCollator(tokenizer)\nCollator for State Space Models (Mamba)"
+    "text": "loaders.constants\nloaders.constants\nShared constants for axolotl.loaders module"
   },
   {
-    "objectID": "docs/api/kernels.geglu.html",
-    "href": "docs/api/kernels.geglu.html",
-    "title": "kernels.geglu",
+    "objectID": "docs/api/utils.dict.html",
+    "href": "docs/api/utils.dict.html",
+    "title": "utils.dict",
     "section": "",
-    "text": "kernels.geglu\nModule for definition of GEGLU Triton kernels.\nSee “GLU Variants Improve Transformer” (https://arxiv.org/abs/2002.05202).\nCredit to unsloth (https://unsloth.ai/) for inspiration for this implementation.\n\n\n\n\n\nName\nDescription\n\n\n\n\ngeglu_backward\nGEGLU backward pass using in-place operations.\n\n\ngeglu_forward\nGEGLU forward pass.\n\n\n\n\n\nkernels.geglu.geglu_backward(grad_output, gate, up)\nGEGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - GEGLU activation output (h) - Gradient with respect to gate (grad_gate) - Gradient with respect to up (grad_up)\n\n\n\n\n\n\nThis function modifies its input tensors in-place to store results.\n\n\n\n\nkernels.geglu.geglu_forward(gate, up)\nGEGLU forward pass.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\ntorch.Tensor: Output tensor of shape [batch, seq_len, hidden_dim]."
+    "text": "utils.dict\nModule containing the DictDefault class\n\n\n\n\n\nName\nDescription\n\n\n\n\nDictDefault\nA Dict that returns None instead of returning empty Dict for missing keys.\n\n\n\n\n\nutils.dict.DictDefault()\nA Dict that returns None instead of returning empty Dict for missing keys.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nremove_none_values\nRemove null from a dictionary-like obj or list.\n\n\n\n\n\nutils.dict.remove_none_values(obj)\nRemove null from a dictionary-like obj or list.\nThese can appear due to Dataset loading causing schema merge.\nSee https://github.com/axolotl-ai-cloud/axolotl/pull/2909"
   },
   {
-    "objectID": "docs/api/kernels.geglu.html#functions",
-    "href": "docs/api/kernels.geglu.html#functions",
-    "title": "kernels.geglu",
+    "objectID": "docs/api/utils.dict.html#classes",
+    "href": "docs/api/utils.dict.html#classes",
+    "title": "utils.dict",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ngeglu_backward\nGEGLU backward pass using in-place operations.\n\n\ngeglu_forward\nGEGLU forward pass.\n\n\n\n\n\nkernels.geglu.geglu_backward(grad_output, gate, up)\nGEGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - GEGLU activation output (h) - Gradient with respect to gate (grad_gate) - Gradient with respect to up (grad_up)\n\n\n\n\n\n\nThis function modifies its input tensors in-place to store results.\n\n\n\n\nkernels.geglu.geglu_forward(gate, up)\nGEGLU forward pass.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\ntorch.Tensor: Output tensor of shape [batch, seq_len, hidden_dim]."
+    "text": "Name\nDescription\n\n\n\n\nDictDefault\nA Dict that returns None instead of returning empty Dict for missing keys.\n\n\n\n\n\nutils.dict.DictDefault()\nA Dict that returns None instead of returning empty Dict for missing keys."
   },
   {
-    "objectID": "docs/api/core.trainers.utils.html",
-    "href": "docs/api/core.trainers.utils.html",
-    "title": "core.trainers.utils",
+    "objectID": "docs/api/utils.dict.html#functions",
+    "href": "docs/api/utils.dict.html#functions",
+    "title": "utils.dict",
     "section": "",
-    "text": "core.trainers.utils\ncore.trainers.utils\nUtils for Axolotl trainers"
+    "text": "Name\nDescription\n\n\n\n\nremove_none_values\nRemove null from a dictionary-like obj or list.\n\n\n\n\n\nutils.dict.remove_none_values(obj)\nRemove null from a dictionary-like obj or list.\nThese can appear due to Dataset loading causing schema merge.\nSee https://github.com/axolotl-ai-cloud/axolotl/pull/2909"
   },
   {
-    "objectID": "docs/api/monkeypatch.llama_attn_hijack_flash.html",
-    "href": "docs/api/monkeypatch.llama_attn_hijack_flash.html",
-    "title": "monkeypatch.llama_attn_hijack_flash",
+    "objectID": "docs/api/utils.schemas.config.html",
+    "href": "docs/api/utils.schemas.config.html",
+    "title": "utils.schemas.config",
     "section": "",
-    "text": "monkeypatch.llama_attn_hijack_flash\nFlash attention monkey patch for llama model\n\n\n\n\n\nName\nDescription\n\n\n\n\nflashattn_forward_with_s2attn\nInput shape: Batch x Time x Channel\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.flashattn_forward_with_s2attn(\n    self,\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    padding_mask=None,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\nInput shape: Batch x Time x Channel\nFrom: https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py\nattention_mask: [bsz, q_len]\ncu_seqlens will be ignored if provided\nmax_seqlen will be ignored if provided"
+    "text": "utils.schemas.config\nModule with Pydantic models for configuration.\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlConfigWCapabilities\nWrapper to valdiate GPU capabilities with the configured options\n\n\nAxolotlInputConfig\nWrapper of all config options.\n\n\nEBFTConfig\nConfiguration for Energy-Based Fine-Tuning (EBFT)\n\n\n\n\n\nutils.schemas.config.AxolotlConfigWCapabilities()\nWrapper to valdiate GPU capabilities with the configured options\n\n\n\nutils.schemas.config.AxolotlInputConfig()\nWrapper of all config options.\n\n\n\nutils.schemas.config.EBFTConfig()\nConfiguration for Energy-Based Fine-Tuning (EBFT)"
   },
   {
-    "objectID": "docs/api/monkeypatch.llama_attn_hijack_flash.html#functions",
-    "href": "docs/api/monkeypatch.llama_attn_hijack_flash.html#functions",
-    "title": "monkeypatch.llama_attn_hijack_flash",
+    "objectID": "docs/api/utils.schemas.config.html#classes",
+    "href": "docs/api/utils.schemas.config.html#classes",
+    "title": "utils.schemas.config",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nflashattn_forward_with_s2attn\nInput shape: Batch x Time x Channel\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.flashattn_forward_with_s2attn(\n    self,\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    padding_mask=None,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\nInput shape: Batch x Time x Channel\nFrom: https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py\nattention_mask: [bsz, q_len]\ncu_seqlens will be ignored if provided\nmax_seqlen will be ignored if provided"
+    "text": "Name\nDescription\n\n\n\n\nAxolotlConfigWCapabilities\nWrapper to valdiate GPU capabilities with the configured options\n\n\nAxolotlInputConfig\nWrapper of all config options.\n\n\nEBFTConfig\nConfiguration for Energy-Based Fine-Tuning (EBFT)\n\n\n\n\n\nutils.schemas.config.AxolotlConfigWCapabilities()\nWrapper to valdiate GPU capabilities with the configured options\n\n\n\nutils.schemas.config.AxolotlInputConfig()\nWrapper of all config options.\n\n\n\nutils.schemas.config.EBFTConfig()\nConfiguration for Energy-Based Fine-Tuning (EBFT)"
   },
   {
-    "objectID": "docs/api/common.architectures.html",
-    "href": "docs/api/common.architectures.html",
-    "title": "common.architectures",
+    "objectID": "docs/api/monkeypatch.mistral_attn_hijack_flash.html",
+    "href": "docs/api/monkeypatch.mistral_attn_hijack_flash.html",
+    "title": "monkeypatch.mistral_attn_hijack_flash",
     "section": "",
-    "text": "common.architectures\ncommon.architectures\nCommon architecture specific constants"
+    "text": "monkeypatch.mistral_attn_hijack_flash\nmonkeypatch.mistral_attn_hijack_flash\nFlash attention monkey patch for mistral model"
   },
   {
-    "objectID": "docs/api/cli.merge_sharded_fsdp_weights.html",
-    "href": "docs/api/cli.merge_sharded_fsdp_weights.html",
-    "title": "cli.merge_sharded_fsdp_weights",
+    "objectID": "docs/api/kernels.utils.html",
+    "href": "docs/api/kernels.utils.html",
+    "title": "kernels.utils",
     "section": "",
-    "text": "cli.merge_sharded_fsdp_weights\nCLI to merge sharded FSDP model checkpoints into a single combined checkpoint.\n\n\n\n\n\nName\nDescription\n\n\n\n\nBFloat16CastPlanner\nA custom planner to cast tensors to bfloat16 on the fly during loading.\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.BFloat16CastPlanner()\nA custom planner to cast tensors to bfloat16 on the fly during loading.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\nmerge_fsdp_weights\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.merge_fsdp_weights(\n    checkpoint_dir,\n    output_path,\n    remove_checkpoint_dir=False,\n)\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\nSHARDED_STATE_DICT was used for the model. Weights will be saved to {output_path}/model.safetensors.\nNote: this is a CPU-bound process.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncheckpoint_dir\nstr\nThe directory containing the FSDP checkpoints (can be either the model or optimizer).\nrequired\n\n\noutput_path\nstr\nThe path to save the merged checkpoint.\nrequired\n\n\nremove_checkpoint_dir\nbool, optional, defaults to False\nWhether to remove the checkpoint directory after merging.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf torch version &lt; 2.3.0, or if checkpoint_dir does not exist."
+    "text": "kernels.utils\nkernels.utils\nUtilities for axolotl.kernels submodules."
   },
   {
-    "objectID": "docs/api/cli.merge_sharded_fsdp_weights.html#classes",
-    "href": "docs/api/cli.merge_sharded_fsdp_weights.html#classes",
-    "title": "cli.merge_sharded_fsdp_weights",
+    "objectID": "docs/api/prompt_strategies.chat_template.html",
+    "href": "docs/api/prompt_strategies.chat_template.html",
+    "title": "prompt_strategies.chat_template",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nBFloat16CastPlanner\nA custom planner to cast tensors to bfloat16 on the fly during loading.\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.BFloat16CastPlanner()\nA custom planner to cast tensors to bfloat16 on the fly during loading."
+    "text": "prompt_strategies.chat_template\nHF Chat Templates prompt strategy\n\n\n\n\n\nName\nDescription\n\n\n\n\nChatTemplatePrompter\nPrompter for HF chat templates\n\n\nChatTemplateStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nMistralPrompter\nMistral prompter for chat template.\n\n\nMistralStrategy\nMistral strategy for chat template.\n\n\nStrategyLoader\nLoad chat template strategy based on configuration.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplatePrompter(\n    tokenizer,\n    chat_template,\n    processor=None,\n    max_length=2048,\n    message_property_mappings=None,\n    message_field_training=None,\n    message_field_training_detail=None,\n    field_messages='messages',\n    field_system='system',\n    field_tools='tools',\n    field_thinking='reasoning_content',\n    roles=None,\n    template_thinking_key='reasoning_content',\n    chat_template_kwargs=None,\n    drop_system_message=False,\n)\nPrompter for HF chat templates\n\n\n\n\n\nName\nDescription\n\n\n\n\nbuild_prompt\nBuild a prompt from a conversation.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplatePrompter.build_prompt(\n    conversation,\n    add_generation_prompt=False,\n    images=None,\n    tools=None,\n    real_last_index=None,\n)\nBuild a prompt from a conversation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconversation\nlist[dict]\nA list of messages.\nrequired\n\n\nadd_generation_prompt\n\nWhether to add a generation prompt.\nFalse\n\n\nimages\n\nA list of images. (optional)\nNone\n\n\ntools\n\nA list of tools. 
(optional)\nNone\n\n\n\n\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs,\n    sequence_len,\n    roles_to_train=None,\n    train_on_eos=None,\n    train_on_eot=None,\n    eot_tokens=None,\n    split_thinking=False,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\n\n\nName\nDescription\n\n\n\n\nfind_first_eot_token\nFind the first EOT token in the input_ids starting from start_idx.\n\n\nfind_turn\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\ntokenize_prompt\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_first_eot_token(\n    input_ids,\n    start_idx,\n)\nFind the first EOT token in the input_ids starting from start_idx.\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_turn(\n    turns,\n    turn_idx,\n    tools=None,\n    content_only=False,\n    reasoning_only=False,\n)\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncontent_only\nbool\nIf True and the turn has reasoning_content (template_thinking_key), preserve reasoning_content in the dummy turn so the diff only captures the content field boundaries. 
This is needed for correct training_detail alignment when reasoning_content is present.\nFalse\n\n\nreasoning_only\nbool\nIf True, preserve content in the dummy turn and replace reasoning_content with a dummy, so the diff only captures the reasoning_content field boundaries.\nFalse\n\n\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.tokenize_prompt(prompt)\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.MistralPrompter(*args, **kwargs)\nMistral prompter for chat template.\n\n\n\nprompt_strategies.chat_template.MistralStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs,\n    sequence_len,\n    roles_to_train=None,\n    train_on_eos=None,\n    train_on_eot=None,\n    eot_tokens=None,\n    split_thinking=False,\n)\nMistral strategy for chat template.\n\n\n\n\n\nName\nDescription\n\n\n\n\nfind_first_eot_token\nFind the first EOT token in the input_ids starting from start_idx.\n\n\n\n\n\nprompt_strategies.chat_template.MistralStrategy.find_first_eot_token(\n    input_ids,\n    start_idx,\n)\nFind the first EOT token in the input_ids starting from start_idx.\n\n\n\n\n\nprompt_strategies.chat_template.StrategyLoader()\nLoad chat template strategy based on configuration."
   },
   {
-    "objectID": "docs/api/cli.merge_sharded_fsdp_weights.html#functions",
-    "href": "docs/api/cli.merge_sharded_fsdp_weights.html#functions",
-    "title": "cli.merge_sharded_fsdp_weights",
+    "objectID": "docs/api/prompt_strategies.chat_template.html#classes",
+    "href": "docs/api/prompt_strategies.chat_template.html#classes",
+    "title": "prompt_strategies.chat_template",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\nmerge_fsdp_weights\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.merge_fsdp_weights(\n    checkpoint_dir,\n    output_path,\n    remove_checkpoint_dir=False,\n)\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\nSHARDED_STATE_DICT was used for the model. Weights will be saved to {output_path}/model.safetensors.\nNote: this is a CPU-bound process.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncheckpoint_dir\nstr\nThe directory containing the FSDP checkpoints (can be either the model or optimizer).\nrequired\n\n\noutput_path\nstr\nThe path to save the merged checkpoint.\nrequired\n\n\nremove_checkpoint_dir\nbool, optional, defaults to False\nWhether to remove the checkpoint directory after merging.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf torch version &lt; 2.3.0, or if checkpoint_dir does not exist."
+    "text": "Name\nDescription\n\n\n\n\nChatTemplatePrompter\nPrompter for HF chat templates\n\n\nChatTemplateStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nMistralPrompter\nMistral prompter for chat template.\n\n\nMistralStrategy\nMistral strategy for chat template.\n\n\nStrategyLoader\nLoad chat template strategy based on configuration.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplatePrompter(\n    tokenizer,\n    chat_template,\n    processor=None,\n    max_length=2048,\n    message_property_mappings=None,\n    message_field_training=None,\n    message_field_training_detail=None,\n    field_messages='messages',\n    field_system='system',\n    field_tools='tools',\n    field_thinking='reasoning_content',\n    roles=None,\n    template_thinking_key='reasoning_content',\n    chat_template_kwargs=None,\n    drop_system_message=False,\n)\nPrompter for HF chat templates\n\n\n\n\n\nName\nDescription\n\n\n\n\nbuild_prompt\nBuild a prompt from a conversation.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplatePrompter.build_prompt(\n    conversation,\n    add_generation_prompt=False,\n    images=None,\n    tools=None,\n    real_last_index=None,\n)\nBuild a prompt from a conversation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconversation\nlist[dict]\nA list of messages.\nrequired\n\n\nadd_generation_prompt\n\nWhether to add a generation prompt.\nFalse\n\n\nimages\n\nA list of images. (optional)\nNone\n\n\ntools\n\nA list of tools. 
(optional)\nNone\n\n\n\n\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs,\n    sequence_len,\n    roles_to_train=None,\n    train_on_eos=None,\n    train_on_eot=None,\n    eot_tokens=None,\n    split_thinking=False,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\n\n\nName\nDescription\n\n\n\n\nfind_first_eot_token\nFind the first EOT token in the input_ids starting from start_idx.\n\n\nfind_turn\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\ntokenize_prompt\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_first_eot_token(\n    input_ids,\n    start_idx,\n)\nFind the first EOT token in the input_ids starting from start_idx.\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_turn(\n    turns,\n    turn_idx,\n    tools=None,\n    content_only=False,\n    reasoning_only=False,\n)\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncontent_only\nbool\nIf True and the turn has reasoning_content (template_thinking_key), preserve reasoning_content in the dummy turn so the diff only captures the content field boundaries. 
This is needed for correct training_detail alignment when reasoning_content is present.\nFalse\n\n\nreasoning_only\nbool\nIf True, preserve content in the dummy turn and replace reasoning_content with a dummy, so the diff only captures the reasoning_content field boundaries.\nFalse\n\n\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.tokenize_prompt(prompt)\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.MistralPrompter(*args, **kwargs)\nMistral prompter for chat template.\n\n\n\nprompt_strategies.chat_template.MistralStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs,\n    sequence_len,\n    roles_to_train=None,\n    train_on_eos=None,\n    train_on_eot=None,\n    eot_tokens=None,\n    split_thinking=False,\n)\nMistral strategy for chat template.\n\n\n\n\n\nName\nDescription\n\n\n\n\nfind_first_eot_token\nFind the first EOT token in the input_ids starting from start_idx.\n\n\n\n\n\nprompt_strategies.chat_template.MistralStrategy.find_first_eot_token(\n    input_ids,\n    start_idx,\n)\nFind the first EOT token in the input_ids starting from start_idx.\n\n\n\n\n\nprompt_strategies.chat_template.StrategyLoader()\nLoad chat template strategy based on configuration."
   },
   {
-    "objectID": "docs/api/prompt_strategies.llama2_chat.html",
-    "href": "docs/api/prompt_strategies.llama2_chat.html",
-    "title": "prompt_strategies.llama2_chat",
+    "objectID": "docs/api/prompt_strategies.user_defined.html",
+    "href": "docs/api/prompt_strategies.user_defined.html",
+    "title": "prompt_strategies.user_defined",
     "section": "",
-    "text": "prompt_strategies.llama2_chat\nPrompt Strategy for finetuning Llama2 chat models\nsee also https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/generation.py#L213 for ma reference implementation.\nThis implementation is based on the Vicuna PR and the fastchat repo, see also:\nhttps://github.com/lm-sys/FastChat/blob/cdd7730686cb1bf9ae2b768ee171bdf7d1ff04f3/fastchat/conversation.py#L847\nUse dataset type: “llama2_chat” in conig.yml to use this prompt style.\nE.g. in the config.yml:\ndatasets:\n  - path: llama_finetune_train.jsonl\n    type: llama2_chat\nThe dataset itself should look like this:\n{'conversations':[{\"from\": \"human\", \"value\": \"Who are you?\"}, {\"from\": \"gpt\", \"value\": \"I am Vicuna\"},...]}\nin a jsonl file. The first message should be from the human, the second from gpt.\nFor a custom system message, the first “from” can be “system” (followed by alternating “human” and “gpt” turns).\nImportant: Don’t use “special_tokens:” in your config.yml if you are not sure what you are doing!\n\n\n\n\n\nName\nDescription\n\n\n\n\nLLama2ChatTokenizingStrategy\nTokenizing strategy for Llama2 prompts.\n\n\nLlama2ChatConversation\nA class that manages prompt templates and keeps all conversation history.\n\n\nLlama2ChatPrompter\nA prompter that generates prompts for Llama2 models.\n\n\n\n\n\nprompt_strategies.llama2_chat.LLama2ChatTokenizingStrategy(*args, **kwargs)\nTokenizing strategy for Llama2 prompts.\nadapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation(\n    name='llama2',\n    system=\"[INST] &lt;&lt;SYS&gt;&gt;\\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. 
Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\\n&lt;&lt;/SYS&gt;&gt;\\n\\n\",\n    roles=('[INST]', '[/INST]'),\n    messages=list(),\n    offset=0,\n)\nA class that manages prompt templates and keeps all conversation history.\ncopied from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py\n\n\n\n\n\nName\nDescription\n\n\n\n\nappend_message\nAppend a new message.\n\n\nget_prompt\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.append_message(\n    role,\n    message,\n)\nAppend a new message.\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.get_prompt()\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatPrompter()\nA prompter that generates prompts for Llama2 models."
+    "text": "prompt_strategies.user_defined\nUser Defined prompts with configuration from the YML config\n\n\n\n\n\nName\nDescription\n\n\n\n\nUserDefinedDatasetConfig\ndataclass configuration representing a userdefined dataset type\n\n\nUserDefinedPromptTokenizationStrategy\nPrompt Tokenization Strategy for user defined prompts\n\n\n\n\n\nprompt_strategies.user_defined.UserDefinedDatasetConfig(\n    system_prompt='',\n    field_system='system',\n    field_instruction='instruction',\n    field_input='input',\n    field_output='output',\n    format='{instruction} {input} ',\n    no_input_format='{instruction} ',\n    system_format='{system}',\n)\ndataclass configuration representing a userdefined dataset type\n\n\n\nprompt_strategies.user_defined.UserDefinedPromptTokenizationStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nPrompt Tokenization Strategy for user defined prompts"
   },
   {
-    "objectID": "docs/api/prompt_strategies.llama2_chat.html#classes",
-    "href": "docs/api/prompt_strategies.llama2_chat.html#classes",
-    "title": "prompt_strategies.llama2_chat",
+    "objectID": "docs/api/prompt_strategies.user_defined.html#classes",
+    "href": "docs/api/prompt_strategies.user_defined.html#classes",
+    "title": "prompt_strategies.user_defined",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nLLama2ChatTokenizingStrategy\nTokenizing strategy for Llama2 prompts.\n\n\nLlama2ChatConversation\nA class that manages prompt templates and keeps all conversation history.\n\n\nLlama2ChatPrompter\nA prompter that generates prompts for Llama2 models.\n\n\n\n\n\nprompt_strategies.llama2_chat.LLama2ChatTokenizingStrategy(*args, **kwargs)\nTokenizing strategy for Llama2 prompts.\nadapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation(\n    name='llama2',\n    system=\"[INST] &lt;&lt;SYS&gt;&gt;\\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\\n&lt;&lt;/SYS&gt;&gt;\\n\\n\",\n    roles=('[INST]', '[/INST]'),\n    messages=list(),\n    offset=0,\n)\nA class that manages prompt templates and keeps all conversation history.\ncopied from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py\n\n\n\n\n\nName\nDescription\n\n\n\n\nappend_message\nAppend a new message.\n\n\nget_prompt\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.append_message(\n    role,\n    message,\n)\nAppend a new message.\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.get_prompt()\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatPrompter()\nA prompter that generates prompts for Llama2 models."
+    "text": "Name\nDescription\n\n\n\n\nUserDefinedDatasetConfig\ndataclass configuration representing a userdefined dataset type\n\n\nUserDefinedPromptTokenizationStrategy\nPrompt Tokenization Strategy for user defined prompts\n\n\n\n\n\nprompt_strategies.user_defined.UserDefinedDatasetConfig(\n    system_prompt='',\n    field_system='system',\n    field_instruction='instruction',\n    field_input='input',\n    field_output='output',\n    format='{instruction} {input} ',\n    no_input_format='{instruction} ',\n    system_format='{system}',\n)\ndataclass configuration representing a userdefined dataset type\n\n\n\nprompt_strategies.user_defined.UserDefinedPromptTokenizationStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nPrompt Tokenization Strategy for user defined prompts"
   },
   {
-    "objectID": "docs/api/loaders.adapter.html",
-    "href": "docs/api/loaders.adapter.html",
-    "title": "loaders.adapter",
+    "objectID": "docs/api/prompt_strategies.kto.chatml.html",
+    "href": "docs/api/prompt_strategies.kto.chatml.html",
+    "title": "prompt_strategies.kto.chatml",
     "section": "",
-    "text": "loaders.adapter\nAdapter loading functionality, including LoRA / QLoRA and associated utils\n\n\n\n\n\nName\nDescription\n\n\n\n\nsetup_quantized_meta_for_peft\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\nsetup_quantized_peft_meta_for_training\nReplaces dummy quant_state.to method with the original function to allow training to continue\n\n\n\n\n\nloaders.adapter.setup_quantized_meta_for_peft(model)\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\n\nloaders.adapter.setup_quantized_peft_meta_for_training(model)\nReplaces dummy quant_state.to method with the original function to allow training to continue"
+    "text": "prompt_strategies.kto.chatml\nKTO strategies for chatml\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/kto-mix-15k conversations\n\n\nintel\nFor Intel Orca KTO\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.kto.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/kto-mix-15k conversations\n\n\n\nprompt_strategies.kto.chatml.intel(cfg, **kwargs)\nFor Intel Orca KTO\nex: argilla/distilabel-intel-orca-kto\n\n\n\nprompt_strategies.kto.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations\nex: argilla/ultrafeedback-binarized-preferences-cleaned-kto"
   },
   {
-    "objectID": "docs/api/loaders.adapter.html#functions",
-    "href": "docs/api/loaders.adapter.html#functions",
-    "title": "loaders.adapter",
+    "objectID": "docs/api/prompt_strategies.kto.chatml.html#functions",
+    "href": "docs/api/prompt_strategies.kto.chatml.html#functions",
+    "title": "prompt_strategies.kto.chatml",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nsetup_quantized_meta_for_peft\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\nsetup_quantized_peft_meta_for_training\nReplaces dummy quant_state.to method with the original function to allow training to continue\n\n\n\n\n\nloaders.adapter.setup_quantized_meta_for_peft(model)\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\n\nloaders.adapter.setup_quantized_peft_meta_for_training(model)\nReplaces dummy quant_state.to method with the original function to allow training to continue"
+    "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/kto-mix-15k conversations\n\n\nintel\nFor Intel Orca KTO\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.kto.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/kto-mix-15k conversations\n\n\n\nprompt_strategies.kto.chatml.intel(cfg, **kwargs)\nFor Intel Orca KTO\nex: argilla/distilabel-intel-orca-kto\n\n\n\nprompt_strategies.kto.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations\nex: argilla/ultrafeedback-binarized-preferences-cleaned-kto"
   },
   {
-    "objectID": "docs/api/cli.utils.fetch.html",
-    "href": "docs/api/cli.utils.fetch.html",
-    "title": "cli.utils.fetch",
+    "objectID": "docs/api/common.datasets.html",
+    "href": "docs/api/common.datasets.html",
+    "title": "common.datasets",
     "section": "",
-    "text": "cli.utils.fetch\nUtilities for axolotl fetch CLI command.\n\n\n\n\n\nName\nDescription\n\n\n\n\nfetch_from_github\nSync files from a specific directory in the GitHub repository.\n\n\n\n\n\ncli.utils.fetch.fetch_from_github(dir_prefix, dest_dir=None, max_workers=5)\nSync files from a specific directory in the GitHub repository.\nOnly downloads files that don’t exist locally or have changed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndir_prefix\nstr\nDirectory prefix to filter files (e.g., ‘examples/’, ‘deepspeed_configs/’).\nrequired\n\n\ndest_dir\nstr | None\nLocal destination directory.\nNone\n\n\nmax_workers\nint\nMaximum number of concurrent downloads.\n5"
+    "text": "common.datasets\nDataset loading utilities.\n\n\n\n\n\nName\nDescription\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and validation datasets and metadata.\n\n\n\n\n\ncommon.datasets.TrainDatasetMeta(\n    train_dataset,\n    eval_dataset=None,\n    total_num_steps=None,\n)\nDataclass with fields for training and validation datasets and metadata.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_datasets\nLoads one or more training or evaluation datasets, calling\n\n\nload_preference_datasets\nLoads one or more training or evaluation datasets for RL training using paired\n\n\nsample_dataset\nRandomly sample num_samples samples with replacement from dataset.\n\n\n\n\n\ncommon.datasets.load_datasets(cfg, cli_args=None, debug=False)\nLoads one or more training or evaluation datasets, calling\naxolotl.utils.data.prepare_datasets. Optionally, logs out debug information.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nPreprocessCliArgs | TrainerCliArgs | None\nCommand-specific CLI arguments.\nNone\n\n\ndebug\nbool\nWhether to print out tokenization of sample. 
This is duplicated in cfg and cli_args, but is kept due to use in our Colab notebooks.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and evaluation datasets and the computed total_num_steps.\n\n\n\n\n\n\n\ncommon.datasets.load_preference_datasets(cfg, cli_args=None)\nLoads one or more training or evaluation datasets for RL training using paired\npreference data, calling axolotl.utils.data.rl.prepare_preference_datasets.\nOptionally, logs out debug information.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nPreprocessCliArgs | TrainerCliArgs | None\nCommand-specific CLI arguments.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and evaluation datasets and the computed\n\n\n\nTrainDatasetMeta\ntotal_num_steps.\n\n\n\n\n\n\n\ncommon.datasets.sample_dataset(dataset, num_samples)\nRandomly sample num_samples samples with replacement from dataset."
   },
   {
-    "objectID": "docs/api/cli.utils.fetch.html#functions",
-    "href": "docs/api/cli.utils.fetch.html#functions",
-    "title": "cli.utils.fetch",
+    "objectID": "docs/api/common.datasets.html#classes",
+    "href": "docs/api/common.datasets.html#classes",
+    "title": "common.datasets",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nfetch_from_github\nSync files from a specific directory in the GitHub repository.\n\n\n\n\n\ncli.utils.fetch.fetch_from_github(dir_prefix, dest_dir=None, max_workers=5)\nSync files from a specific directory in the GitHub repository.\nOnly downloads files that don’t exist locally or have changed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndir_prefix\nstr\nDirectory prefix to filter files (e.g., ‘examples/’, ‘deepspeed_configs/’).\nrequired\n\n\ndest_dir\nstr | None\nLocal destination directory.\nNone\n\n\nmax_workers\nint\nMaximum number of concurrent downloads.\n5"
+    "text": "Name\nDescription\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and validation datasets and metadata.\n\n\n\n\n\ncommon.datasets.TrainDatasetMeta(\n    train_dataset,\n    eval_dataset=None,\n    total_num_steps=None,\n)\nDataclass with fields for training and validation datasets and metadata."
   },
   {
-    "objectID": "docs/api/monkeypatch.stablelm_attn_hijack_flash.html",
-    "href": "docs/api/monkeypatch.stablelm_attn_hijack_flash.html",
-    "title": "monkeypatch.stablelm_attn_hijack_flash",
+    "objectID": "docs/api/common.datasets.html#functions",
+    "href": "docs/api/common.datasets.html#functions",
+    "title": "common.datasets",
     "section": "",
-    "text": "monkeypatch.stablelm_attn_hijack_flash\nPyTorch StableLM Epoch model.\n\n\n\n\n\nName\nDescription\n\n\n\n\nrepeat_kv\nThis is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,\n\n\nrotate_half\nRotates half the hidden dims of the input.\n\n\n\n\n\nmonkeypatch.stablelm_attn_hijack_flash.repeat_kv(hidden_states, n_rep)\nThis is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,\nnum_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)\n\n\n\nmonkeypatch.stablelm_attn_hijack_flash.rotate_half(x)\nRotates half the hidden dims of the input."
+    "text": "Name\nDescription\n\n\n\n\nload_datasets\nLoads one or more training or evaluation datasets, calling\n\n\nload_preference_datasets\nLoads one or more training or evaluation datasets for RL training using paired\n\n\nsample_dataset\nRandomly sample num_samples samples with replacement from dataset.\n\n\n\n\n\ncommon.datasets.load_datasets(cfg, cli_args=None, debug=False)\nLoads one or more training or evaluation datasets, calling\naxolotl.utils.data.prepare_datasets. Optionally, logs out debug information.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nPreprocessCliArgs | TrainerCliArgs | None\nCommand-specific CLI arguments.\nNone\n\n\ndebug\nbool\nWhether to print out tokenization of sample. This is duplicated in cfg and cli_args, but is kept due to use in our Colab notebooks.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and evaluation datasets and the computed total_num_steps.\n\n\n\n\n\n\n\ncommon.datasets.load_preference_datasets(cfg, cli_args=None)\nLoads one or more training or evaluation datasets for RL training using paired\npreference data, calling axolotl.utils.data.rl.prepare_preference_datasets.\nOptionally, logs out debug information.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nPreprocessCliArgs | TrainerCliArgs | None\nCommand-specific CLI arguments.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and evaluation datasets and the computed\n\n\n\nTrainDatasetMeta\ntotal_num_steps.\n\n\n\n\n\n\n\ncommon.datasets.sample_dataset(dataset, num_samples)\nRandomly sample num_samples samples with replacement from dataset."
   },
   {
-    "objectID": "docs/api/monkeypatch.stablelm_attn_hijack_flash.html#functions",
-    "href": "docs/api/monkeypatch.stablelm_attn_hijack_flash.html#functions",
-    "title": "monkeypatch.stablelm_attn_hijack_flash",
+    "objectID": "docs/api/core.trainers.mixins.scheduler.html",
+    "href": "docs/api/core.trainers.mixins.scheduler.html",
+    "title": "core.trainers.mixins.scheduler",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nrepeat_kv\nThis is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,\n\n\nrotate_half\nRotates half the hidden dims of the input.\n\n\n\n\n\nmonkeypatch.stablelm_attn_hijack_flash.repeat_kv(hidden_states, n_rep)\nThis is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,\nnum_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)\n\n\n\nmonkeypatch.stablelm_attn_hijack_flash.rotate_half(x)\nRotates half the hidden dims of the input."
+    "text": "core.trainers.mixins.scheduler\nModule for Axolotl trainer scheduler mixin\n\n\n\n\n\nName\nDescription\n\n\n\n\nSchedulerMixin\nMixin class for scheduler setup in CausalTrainer.\n\n\n\n\n\ncore.trainers.mixins.scheduler.SchedulerMixin()\nMixin class for scheduler setup in CausalTrainer.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncreate_scheduler\nSet up the scheduler. The optimizer of the trainer must have been set up either before this method is called or\n\n\n\n\n\ncore.trainers.mixins.scheduler.SchedulerMixin.create_scheduler(\n    num_training_steps,\n    optimizer=None,\n)\nSet up the scheduler. The optimizer of the trainer must have been set up either before this method is called or\npassed as an argument.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nnum_training_steps\nint\nThe number of training steps to do.\nrequired\n\n\noptimizer\ntorch.optim.Optimizer\nThe training optimizer\nNone"
   },
   {
-    "objectID": "docs/api/cli.main.html",
-    "href": "docs/api/cli.main.html",
-    "title": "cli.main",
+    "objectID": "docs/api/core.trainers.mixins.scheduler.html#classes",
+    "href": "docs/api/core.trainers.mixins.scheduler.html#classes",
+    "title": "core.trainers.mixins.scheduler",
     "section": "",
-    "text": "cli.main\nClick CLI definitions for various axolotl commands.\n\n\n\n\n\nName\nDescription\n\n\n\n\nagent_docs\nShow agent-optimized documentation.\n\n\ncli\nAxolotl CLI - Train and fine-tune large language models\n\n\nconfig_schema\nDump the full config JSON schema.\n\n\nevaluate\nEvaluate a model.\n\n\nfetch\nFetch example configs or other resources.\n\n\ninference\nRun inference with a trained model.\n\n\nmerge_lora\nMerge trained LoRA adapters into a base model.\n\n\nmerge_sharded_fsdp_weights\nMerge sharded FSDP model weights.\n\n\npreprocess\nPreprocess datasets before training.\n\n\ntrain\nTrain or fine-tune a model.\n\n\n\n\n\ncli.main.agent_docs(topic, list_topics)\nShow agent-optimized documentation.\nPrints reference docs designed for AI coding agents.\nThese docs are bundled with the package — no network access needed.\n\b\nExamples:\naxolotl agent-docs # overview (start here)\naxolotl agent-docs grpo # GRPO reference\naxolotl agent-docs sft # SFT reference\naxolotl agent-docs –list # list all topics\n\n\n\ncli.main.cli()\nAxolotl CLI - Train and fine-tune large language models\n\n\n\ncli.main.config_schema(output_format, field)\nDump the full config JSON schema.\nUseful for AI agents and tooling to discover all available config options,\ntheir types, defaults, and descriptions.\n\b\nExamples:\naxolotl config-schema # full JSON schema\naxolotl config-schema –format yaml # YAML format\naxolotl config-schema –field adapter # single field\n\n\n\ncli.main.evaluate(ctx, config, launcher, **kwargs)\nEvaluate a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\nclick.Context\nClick context for extra args.\nrequired\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nlauncher\nstr\nLauncher to use for multi-GPU evaluation (“accelerate”, “torchrun”, or “python”).\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config 
options.\n{}\n\n\n\n\n\n\n\ncli.main.fetch(directory, dest)\nFetch example configs or other resources.\nAvailable directories:\n- examples: Example configuration files\n- deepspeed_configs: DeepSpeed configuration files\n- docs: Full documentation (Quarto markdown files)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndirectory\nstr\nOne of examples, deepspeed_configs, docs.\nrequired\n\n\ndest\nOptional[str]\nOptional destination directory.\nrequired\n\n\n\n\n\n\n\ncli.main.inference(ctx, config, launcher, gradio, **kwargs)\nRun inference with a trained model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\nclick.Context\nClick context for extra args.\nrequired\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nlauncher\nstr\nLauncher to use for multi-GPU inference (“accelerate”, “torchrun”, or “python”).\nrequired\n\n\ngradio\nbool\nWhether to use Gradio browser interface or command line for inference.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_lora(config, **kwargs)\nMerge trained LoRA adapters into a base model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_sharded_fsdp_weights(ctx, config, launcher, **kwargs)\nMerge sharded FSDP model weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\nclick.Context\nClick context for extra args.\nrequired\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nlauncher\nstr\nLauncher to use for weight merging (“accelerate”, “torchrun”, or “python”).\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.preprocess(config, cloud=None, 
**kwargs)\nPreprocess datasets before training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\ncloud\nOptional[str]\nPath to a cloud accelerator configuration file.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.train(\n    ctx,\n    config,\n    launcher='accelerate',\n    cloud=None,\n    sweep=None,\n    **kwargs,\n)\nTrain or fine-tune a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\nclick.Context\nClick context for extra args.\nrequired\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nlauncher\nLiteral['accelerate', 'torchrun', 'python']\nLauncher to use for multi-GPU training (“accelerate”, “torchrun”, or “python”).\n'accelerate'\n\n\ncloud\nstr | None\nPath to a cloud accelerator configuration file\nNone\n\n\nsweep\nstr | None\nPath to YAML config for sweeping hyperparameters.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}"
+    "text": "Name\nDescription\n\n\n\n\nSchedulerMixin\nMixin class for scheduler setup in CausalTrainer.\n\n\n\n\n\ncore.trainers.mixins.scheduler.SchedulerMixin()\nMixin class for scheduler setup in CausalTrainer.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncreate_scheduler\nSet up the scheduler. The optimizer of the trainer must have been set up either before this method is called or\n\n\n\n\n\ncore.trainers.mixins.scheduler.SchedulerMixin.create_scheduler(\n    num_training_steps,\n    optimizer=None,\n)\nSet up the scheduler. The optimizer of the trainer must have been set up either before this method is called or\npassed as an argument.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nnum_training_steps\nint\nThe number of training steps to do.\nrequired\n\n\noptimizer\ntorch.optim.Optimizer\nThe training optimizer\nNone"
   },
   {
-    "objectID": "docs/api/cli.main.html#functions",
-    "href": "docs/api/cli.main.html#functions",
-    "title": "cli.main",
+    "objectID": "docs/api/core.datasets.chat.html",
+    "href": "docs/api/core.datasets.chat.html",
+    "title": "core.datasets.chat",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nagent_docs\nShow agent-optimized documentation.\n\n\ncli\nAxolotl CLI - Train and fine-tune large language models\n\n\nconfig_schema\nDump the full config JSON schema.\n\n\nevaluate\nEvaluate a model.\n\n\nfetch\nFetch example configs or other resources.\n\n\ninference\nRun inference with a trained model.\n\n\nmerge_lora\nMerge trained LoRA adapters into a base model.\n\n\nmerge_sharded_fsdp_weights\nMerge sharded FSDP model weights.\n\n\npreprocess\nPreprocess datasets before training.\n\n\ntrain\nTrain or fine-tune a model.\n\n\n\n\n\ncli.main.agent_docs(topic, list_topics)\nShow agent-optimized documentation.\nPrints reference docs designed for AI coding agents.\nThese docs are bundled with the package — no network access needed.\n\b\nExamples:\naxolotl agent-docs # overview (start here)\naxolotl agent-docs grpo # GRPO reference\naxolotl agent-docs sft # SFT reference\naxolotl agent-docs –list # list all topics\n\n\n\ncli.main.cli()\nAxolotl CLI - Train and fine-tune large language models\n\n\n\ncli.main.config_schema(output_format, field)\nDump the full config JSON schema.\nUseful for AI agents and tooling to discover all available config options,\ntheir types, defaults, and descriptions.\n\b\nExamples:\naxolotl config-schema # full JSON schema\naxolotl config-schema –format yaml # YAML format\naxolotl config-schema –field adapter # single field\n\n\n\ncli.main.evaluate(ctx, config, launcher, **kwargs)\nEvaluate a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\nclick.Context\nClick context for extra args.\nrequired\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nlauncher\nstr\nLauncher to use for multi-GPU evaluation (“accelerate”, “torchrun”, or “python”).\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.fetch(directory, dest)\nFetch example configs or other resources.\nAvailable 
directories:\n- examples: Example configuration files\n- deepspeed_configs: DeepSpeed configuration files\n- docs: Full documentation (Quarto markdown files)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndirectory\nstr\nOne of examples, deepspeed_configs, docs.\nrequired\n\n\ndest\nOptional[str]\nOptional destination directory.\nrequired\n\n\n\n\n\n\n\ncli.main.inference(ctx, config, launcher, gradio, **kwargs)\nRun inference with a trained model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\nclick.Context\nClick context for extra args.\nrequired\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nlauncher\nstr\nLauncher to use for multi-GPU inference (“accelerate”, “torchrun”, or “python”).\nrequired\n\n\ngradio\nbool\nWhether to use Gradio browser interface or command line for inference.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_lora(config, **kwargs)\nMerge trained LoRA adapters into a base model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_sharded_fsdp_weights(ctx, config, launcher, **kwargs)\nMerge sharded FSDP model weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\nclick.Context\nClick context for extra args.\nrequired\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nlauncher\nstr\nLauncher to use for weight merging (“accelerate”, “torchrun”, or “python”).\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.preprocess(config, cloud=None, **kwargs)\nPreprocess datasets before 
training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\ncloud\nOptional[str]\nPath to a cloud accelerator configuration file.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.train(\n    ctx,\n    config,\n    launcher='accelerate',\n    cloud=None,\n    sweep=None,\n    **kwargs,\n)\nTrain or fine-tune a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\nclick.Context\nClick context for extra args.\nrequired\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nlauncher\nLiteral['accelerate', 'torchrun', 'python']\nLauncher to use for multi-GPU training (“accelerate”, “torchrun”, or “python”).\n'accelerate'\n\n\ncloud\nstr | None\nPath to a cloud accelerator configuration file\nNone\n\n\nsweep\nstr | None\nPath to YAML config for sweeping hyperparameters.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}"
+    "text": "core.datasets.chat\nchat dataset module\n\n\n\n\n\nName\nDescription\n\n\n\n\nTokenizedChatDataset\nTokenized chat dataset\n\n\n\n\n\ncore.datasets.chat.TokenizedChatDataset(\n    data,\n    model_transform,\n    *args,\n    message_transform=None,\n    formatter=None,\n    process_count=None,\n    keep_in_memory=False,\n    **kwargs,\n)\nTokenized chat dataset"
   },
   {
-    "objectID": "docs/api/utils.schedulers.html",
-    "href": "docs/api/utils.schedulers.html",
-    "title": "utils.schedulers",
+    "objectID": "docs/api/core.datasets.chat.html#classes",
+    "href": "docs/api/core.datasets.chat.html#classes",
+    "title": "core.datasets.chat",
     "section": "",
-    "text": "utils.schedulers\nModule for custom LRScheduler class\n\n\n\n\n\nName\nDescription\n\n\n\n\nInterpolatingLogScheduler\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\nJaggedLRRestartScheduler\nWraps another scheduler to apply per-lora-restart learning rate warmups.\n\n\nRexLR\nReflected Exponential (REX) learning rate scheduler.\n\n\n\n\n\nutils.schedulers.InterpolatingLogScheduler(\n    optimizer,\n    num_steps,\n    min_lr,\n    max_lr,\n    last_epoch=-1,\n)\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\n\nutils.schedulers.JaggedLRRestartScheduler(\n    optimizer,\n    inner_schedule,\n    jagged_restart_steps,\n    jagged_restart_warmup_steps,\n    jagged_restart_anneal_steps=1,\n    min_lr_scale=0.001,\n)\nWraps another scheduler to apply per-lora-restart learning rate warmups.\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_state_dict\nRestore state, including inner_schedule.\n\n\nstate_dict\nReturn serializable state, saving inner_schedule as its own state_dict.\n\n\n\n\n\nutils.schedulers.JaggedLRRestartScheduler.load_state_dict(state_dict)\nRestore state, including inner_schedule.\n\n\n\nutils.schedulers.JaggedLRRestartScheduler.state_dict()\nReturn serializable state, saving inner_schedule as its own state_dict.\n\n\n\n\n\nutils.schedulers.RexLR(\n    optimizer,\n    max_lr,\n    min_lr,\n    total_steps=0,\n    num_warmup_steps=0,\n    last_step=0,\n)\nReflected Exponential (REX) learning rate scheduler.\n\nOriginal implementation: https://github.com/IvanVassi/REX_LR\nOriginal license: Apache 2.0\nBased on: https://arxiv.org/abs/2107.04197\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\ntorch.optim.Optimizer\nThe optimizer to schedule the learning rate for.\nrequired\n\n\nmax_lr\nfloat\nThe maximum learning rate.\nrequired\n\n\nmin_lr\nfloat\nThe minimum learning rate.\nrequired\n\n\ntotal_steps\nint\nThe total number of training 
steps.\n0\n\n\nnum_warmup_steps\nint\nThe number of warmup steps.\n0\n\n\nlast_step\nint\nThe index of last step.\n0\n\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_cosine_schedule_with_min_lr\n\n\n\nget_cosine_schedule_with_quadratic_warmup\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\n\n\nget_cosine_schedule_with_warmup_decay_constant\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf)\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_min_lr(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    min_lr_ratio=0.0,\n)\n\n\n\nlinear warmup from 0 -&gt; max_lr over num_warmup_steps\ncosine learning rate annealing from max_lr -&gt; min_lr over num_training_steps\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_quadratic_warmup(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    num_cycles=0.5,\n    last_epoch=-1,\n)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the\ninitial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nnum_cycles\nfloat, optional, defaults to 0.5\nThe number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine).\n0.5\n\n\nlast_epoch\nint, optional, defaults to -1\nThe index of the last epoch when resuming training.\n-1\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate 
schedule.\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_warmup_decay_constant(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    constant_lr_ratio,\n    min_lr_ratio,\n    num_cycles=0.5,\n    last_epoch=-1,\n)\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to min_lr_ratio until num_training_steps * constant_lr_ratio, after constant_rate returns constant value of min_rate\n, after a warmup period during which it increases linearly between 0 and the initial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nconstant_lr_ratio\nfloat\n(float): The ratio of num_training_steps to decrease by cosine function.\nrequired\n\n\nmin_lr_ratio\nfloat\n(float): The ratio of maximum learning rate for cosine function to decay to minimum learning rate.                            | _required_ | | num_cycles         |float, *optional*, defaults to 0.5 | The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine). |0.5| | last_epoch         |int, *optional*, defaults to -1    | The index of the last epoch when resuming training.                                                                            |-1`\n\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule."
+    "text": "Name\nDescription\n\n\n\n\nTokenizedChatDataset\nTokenized chat dataset\n\n\n\n\n\ncore.datasets.chat.TokenizedChatDataset(\n    data,\n    model_transform,\n    *args,\n    message_transform=None,\n    formatter=None,\n    process_count=None,\n    keep_in_memory=False,\n    **kwargs,\n)\nTokenized chat dataset"
   },
   {
-    "objectID": "docs/api/utils.schedulers.html#classes",
-    "href": "docs/api/utils.schedulers.html#classes",
-    "title": "utils.schedulers",
+    "objectID": "docs/api/loaders.patch_manager.html",
+    "href": "docs/api/loaders.patch_manager.html",
+    "title": "loaders.patch_manager",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nInterpolatingLogScheduler\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\nJaggedLRRestartScheduler\nWraps another scheduler to apply per-lora-restart learning rate warmups.\n\n\nRexLR\nReflected Exponential (REX) learning rate scheduler.\n\n\n\n\n\nutils.schedulers.InterpolatingLogScheduler(\n    optimizer,\n    num_steps,\n    min_lr,\n    max_lr,\n    last_epoch=-1,\n)\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\n\nutils.schedulers.JaggedLRRestartScheduler(\n    optimizer,\n    inner_schedule,\n    jagged_restart_steps,\n    jagged_restart_warmup_steps,\n    jagged_restart_anneal_steps=1,\n    min_lr_scale=0.001,\n)\nWraps another scheduler to apply per-lora-restart learning rate warmups.\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_state_dict\nRestore state, including inner_schedule.\n\n\nstate_dict\nReturn serializable state, saving inner_schedule as its own state_dict.\n\n\n\n\n\nutils.schedulers.JaggedLRRestartScheduler.load_state_dict(state_dict)\nRestore state, including inner_schedule.\n\n\n\nutils.schedulers.JaggedLRRestartScheduler.state_dict()\nReturn serializable state, saving inner_schedule as its own state_dict.\n\n\n\n\n\nutils.schedulers.RexLR(\n    optimizer,\n    max_lr,\n    min_lr,\n    total_steps=0,\n    num_warmup_steps=0,\n    last_step=0,\n)\nReflected Exponential (REX) learning rate scheduler.\n\nOriginal implementation: https://github.com/IvanVassi/REX_LR\nOriginal license: Apache 2.0\nBased on: https://arxiv.org/abs/2107.04197\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\ntorch.optim.Optimizer\nThe optimizer to schedule the learning rate for.\nrequired\n\n\nmax_lr\nfloat\nThe maximum learning rate.\nrequired\n\n\nmin_lr\nfloat\nThe minimum learning rate.\nrequired\n\n\ntotal_steps\nint\nThe total number of training steps.\n0\n\n\nnum_warmup_steps\nint\nThe number of warmup steps.\n0\n\n\nlast_step\nint\nThe 
index of last step.\n0"
+    "text": "loaders.patch_manager\nPatch manager class implementation to complement axolotl.loaders.ModelLoader.\nApplies pre- and post-model load patches for various fixes and optimizations.\n\n\n\n\n\nName\nDescription\n\n\n\n\nPatchManager\nManages the application of patches during the model loading process.\n\n\n\n\n\nloaders.patch_manager.PatchManager(cfg, model_config, inference=False)\nManages the application of patches during the model loading process.\n\n\n\n\n\nName\nDescription\n\n\n\n\nhas_flash_attn\nCheck if flash attention is installed.\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\napply_post_model_build_patches\nApply patches right after model build, before post-load setup.\n\n\napply_post_model_load_patches\nApply patches that require the model instance.\n\n\napply_post_plugin_pre_model_load_patches\nApply post plugin-pre_model_load load patches based on config.\n\n\napply_pre_config_load_patches\nApply patches that must be set up before config loading.\n\n\napply_pre_model_load_patches\nApply pre-model load patches based on config.\n\n\napply_pre_tokenizer_load_patches\nApply patches that must be set up before tokenizer loading.\n\n\n\n\n\nloaders.patch_manager.PatchManager.apply_post_model_build_patches(model)\nApply patches right after model build, before post-load setup.\n\n\n\nloaders.patch_manager.PatchManager.apply_post_model_load_patches(model)\nApply patches that require the model instance.\n\n\n\nloaders.patch_manager.PatchManager.apply_post_plugin_pre_model_load_patches()\nApply post plugin-pre_model_load load patches based on config.\n\n\n\nloaders.patch_manager.PatchManager.apply_pre_config_load_patches(cfg)\nApply patches that must be set up before config loading.\nThis is for patches that intercept remote code loading from HuggingFace,\nwhich needs to be in place before AutoConfig.from_pretrained() is called.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nConfiguration dictionary with model and 
training settings.\nrequired\n\n\n\n\n\n\n\nloaders.patch_manager.PatchManager.apply_pre_model_load_patches()\nApply pre-model load patches based on config.\n\n\n\nloaders.patch_manager.PatchManager.apply_pre_tokenizer_load_patches(cfg)\nApply patches that must be set up before tokenizer loading.\nThis is for patches that intercept remote code loading from HuggingFace,\nwhich needs to be in place before AutoTokenizer.from_pretrained() is called.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nConfiguration dictionary with model and training settings.\nrequired"
   },
   {
-    "objectID": "docs/api/utils.schedulers.html#functions",
-    "href": "docs/api/utils.schedulers.html#functions",
-    "title": "utils.schedulers",
+    "objectID": "docs/api/loaders.patch_manager.html#classes",
+    "href": "docs/api/loaders.patch_manager.html#classes",
+    "title": "loaders.patch_manager",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nget_cosine_schedule_with_min_lr\n\n\n\nget_cosine_schedule_with_quadratic_warmup\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\n\n\nget_cosine_schedule_with_warmup_decay_constant\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf)\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_min_lr(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    min_lr_ratio=0.0,\n)\n\n\n\nlinear warmup from 0 -&gt; max_lr over num_warmup_steps\ncosine learning rate annealing from max_lr -&gt; min_lr over num_training_steps\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_quadratic_warmup(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    num_cycles=0.5,\n    last_epoch=-1,\n)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the\ninitial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nnum_cycles\nfloat, optional, defaults to 0.5\nThe number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine).\n0.5\n\n\nlast_epoch\nint, optional, defaults to -1\nThe index of the last epoch when resuming training.\n-1\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule.\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_warmup_decay_constant(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    
constant_lr_ratio,\n    min_lr_ratio,\n    num_cycles=0.5,\n    last_epoch=-1,\n)\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to min_lr_ratio until num_training_steps * constant_lr_ratio, after constant_rate returns constant value of min_rate\n, after a warmup period during which it increases linearly between 0 and the initial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nconstant_lr_ratio\nfloat\n(float): The ratio of num_training_steps to decrease by cosine function.\nrequired\n\n\nmin_lr_ratio\nfloat\n(float): The ratio of maximum learning rate for cosine function to decay to minimum learning rate.                            | _required_ | | num_cycles         |float, *optional*, defaults to 0.5 | The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine). |0.5| | last_epoch         |int, *optional*, defaults to -1    | The index of the last epoch when resuming training.                                                                            |-1`\n\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule."
+    "text": "Name\nDescription\n\n\n\n\nPatchManager\nManages the application of patches during the model loading process.\n\n\n\n\n\nloaders.patch_manager.PatchManager(cfg, model_config, inference=False)\nManages the application of patches during the model loading process.\n\n\n\n\n\nName\nDescription\n\n\n\n\nhas_flash_attn\nCheck if flash attention is installed.\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\napply_post_model_build_patches\nApply patches right after model build, before post-load setup.\n\n\napply_post_model_load_patches\nApply patches that require the model instance.\n\n\napply_post_plugin_pre_model_load_patches\nApply post plugin-pre_model_load load patches based on config.\n\n\napply_pre_config_load_patches\nApply patches that must be set up before config loading.\n\n\napply_pre_model_load_patches\nApply pre-model load patches based on config.\n\n\napply_pre_tokenizer_load_patches\nApply patches that must be set up before tokenizer loading.\n\n\n\n\n\nloaders.patch_manager.PatchManager.apply_post_model_build_patches(model)\nApply patches right after model build, before post-load setup.\n\n\n\nloaders.patch_manager.PatchManager.apply_post_model_load_patches(model)\nApply patches that require the model instance.\n\n\n\nloaders.patch_manager.PatchManager.apply_post_plugin_pre_model_load_patches()\nApply post plugin-pre_model_load load patches based on config.\n\n\n\nloaders.patch_manager.PatchManager.apply_pre_config_load_patches(cfg)\nApply patches that must be set up before config loading.\nThis is for patches that intercept remote code loading from HuggingFace,\nwhich needs to be in place before AutoConfig.from_pretrained() is called.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nConfiguration dictionary with model and training settings.\nrequired\n\n\n\n\n\n\n\nloaders.patch_manager.PatchManager.apply_pre_model_load_patches()\nApply pre-model load patches based on 
config.\n\n\n\nloaders.patch_manager.PatchManager.apply_pre_tokenizer_load_patches(cfg)\nApply patches that must be set up before tokenizer loading.\nThis is for patches that intercept remote code loading from HuggingFace,\nwhich needs to be in place before AutoTokenizer.from_pretrained() is called.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nConfiguration dictionary with model and training settings.\nrequired"
   },
   {
-    "objectID": "docs/api/utils.samplers.multipack.html",
-    "href": "docs/api/utils.samplers.multipack.html",
-    "title": "utils.samplers.multipack",
+    "objectID": "docs/api/cli.utils.html",
+    "href": "docs/api/cli.utils.html",
+    "title": "cli.utils",
     "section": "",
-    "text": "utils.samplers.multipack\nMultipack Batch Sampler - An efficient batch sampler for packing variable-length sequences\ninto fixed-capacity batches to optimize memory usage and training throughput.\n\n\n\n\n\nName\nDescription\n\n\n\n\nMultipackBatchSampler\nBatch sampler class for efficient packing of variable-length sequences\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler(\n    sampler,\n    batch_size,\n    batch_max_len,\n    lengths,\n    bin_size,\n    packing_efficiency_estimate=1.0,\n    drop_last=True,\n    num_count_samples=4,\n    sequential=False,\n    group_size=100000,\n    num_processes=None,\n    safe_mode=True,\n    mp_start_method='fork',\n    **kwargs,\n)\nBatch sampler class for efficient packing of variable-length sequences\nThis sampler packs sequences into fixed-capacity bins (batches) to maximize\nGPU memory utilization and training throughput by reducing padding.\nIt supports both parallel packing (using FFD algorithm) and\nsequential packing (preserving original sequence order).\n\n\n\n\n\nName\nDescription\n\n\n\n\nefficiency\nCalculate the packing efficiency (ratio of tokens used to total token slots).\n\n\ngather_efficiency\nGather and synchronize packing efficiency estimates across all distributed\n\n\ngather_len_batches\nGather and synchronize batch counts across all distributed ranks. 
Returns\n\n\ngenerate_batches\nGenerate packed batches for training.\n\n\nset_epoch\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.efficiency()\nCalculate the packing efficiency (ratio of tokens used to total token slots).\nHigher is better - 1.0 would mean perfect packing with no wasted space.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_efficiency()\nGather and synchronize packing efficiency estimates across all distributed\nranks.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nfloat\nA conservative efficiency estimate based on the measurements.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_len_batches(num)\nGather and synchronize batch counts across all distributed ranks. Returns\nthe minimum number of batches available on any rank.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.generate_batches(set_stats=False)\nGenerate packed batches for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nset_stats\nbool\nWhether to update efficiency statistics.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[list[int]]]\nList of batches, where each batch contains multiple bins, and each bin contains multiple sequence indices.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.set_epoch(epoch)\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nallocate_sequentially\nSequential allocator that preserves example order.\n\n\nffd_check\nFirst-fit-decreasing bin packing algorithm check.\n\n\npack_group\nPack a group of sequences into bins using First-Fit Decreasing algorithm.\n\n\npack_parallel\nPack sequences into bins using parallel processing.\n\n\n\n\n\nutils.samplers.multipack.allocate_sequentially(\n    sequence_lengths,\n    rank,\n    bin_capacity,\n    num_ranks,\n)\nSequential allocator that preserves example 
order.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nThe lengths of all examples.\nrequired\n\n\nrank\nint\nThe current rank (for distributed training).\nrequired\n\n\nbin_capacity\nint\nThe capacity of each bin (maximum sequence length).\nrequired\n\n\nnum_ranks\nint\nNumber of ranks (processes / GPUs).\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nrank_batches\nlist[list[int]]\nList of batches for the current rank.\n\n\ntotal_tokens_used\nint\nNumber of actual example tokens.\n\n\ntotal_token_slots\nint\nMaximum theoretical number of example tokens (number of bins * bin capacity).\n\n\n\n\n\n\n\nutils.samplers.multipack.ffd_check(sequence_lengths, bin_capacity, num_bins)\nFirst-fit-decreasing bin packing algorithm check.\nChecks if sequences with the given lengths could fit in the specified number of\nbins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin.\nrequired\n\n\nnum_bins\nint\nNumber of bins available.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nTrue if all sequences can be packed, False otherwise.\n\n\n\n\n\n\n\nutils.samplers.multipack.pack_group(\n    sequence_lengths,\n    group_offset,\n    bin_capacity,\n    max_bins,\n    bin_size,\n    safe_mode=True,\n)\nPack a group of sequences into bins using First-Fit Decreasing algorithm.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\ngroup_offset\nint\nOffset to apply to indices when returning results.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin.\nrequired\n\n\nmax_bins\nint\nMaximum number of bins to use.\nrequired\n\n\nbin_size\nint\nMaximum number of sequences per bin.\nrequired\n\n\nsafe_mode\nbool\nIf True, use a more conservative packing 
approach.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[int]]\nList of bins, where each bin contains indices of sequences assigned to it.\n\n\n\n\n\n\n\nutils.samplers.multipack.pack_parallel(\n    sequence_lengths,\n    bin_capacity,\n    group_size,\n    bin_size,\n    num_processes=None,\n    safe_mode=True,\n    mp_start_method='fork',\n)\nPack sequences into bins using parallel processing.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin as total number of tokens.\nrequired\n\n\ngroup_size\nint\nNumber of sequences to process in each group.\nrequired\n\n\nbin_size\nint\nMaximum number of bins to use.\nrequired\n\n\nnum_processes\nint | None\nNumber of parallel processes to use.\nNone\n\n\nsafe_mode\nbool\nIf True, use a more conservative packing approach.\nTrue\n\n\nmp_start_method\nstr | None\nMultiprocessing start method (‘fork’, ‘spawn’, ‘forkserver’). ‘spawn’ is often safer with Numba/PyTorch. Set to None to use system default.\n'fork'\n\n\n\nReturns:\nList of bins, where each bin contains indices of sequences assigned to it."
+    "text": "cli.utils\ncli.utils\nInit for axolotl.cli.utils module."
   },
   {
-    "objectID": "docs/api/utils.samplers.multipack.html#classes",
-    "href": "docs/api/utils.samplers.multipack.html#classes",
-    "title": "utils.samplers.multipack",
+    "objectID": "docs/api/utils.callbacks.mlflow_.html",
+    "href": "docs/api/utils.callbacks.mlflow_.html",
+    "title": "utils.callbacks.mlflow_",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nMultipackBatchSampler\nBatch sampler class for efficient packing of variable-length sequences\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler(\n    sampler,\n    batch_size,\n    batch_max_len,\n    lengths,\n    bin_size,\n    packing_efficiency_estimate=1.0,\n    drop_last=True,\n    num_count_samples=4,\n    sequential=False,\n    group_size=100000,\n    num_processes=None,\n    safe_mode=True,\n    mp_start_method='fork',\n    **kwargs,\n)\nBatch sampler class for efficient packing of variable-length sequences\nThis sampler packs sequences into fixed-capacity bins (batches) to maximize\nGPU memory utilization and training throughput by reducing padding.\nIt supports both parallel packing (using FFD algorithm) and\nsequential packing (preserving original sequence order).\n\n\n\n\n\nName\nDescription\n\n\n\n\nefficiency\nCalculate the packing efficiency (ratio of tokens used to total token slots).\n\n\ngather_efficiency\nGather and synchronize packing efficiency estimates across all distributed\n\n\ngather_len_batches\nGather and synchronize batch counts across all distributed ranks. Returns\n\n\ngenerate_batches\nGenerate packed batches for training.\n\n\nset_epoch\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.efficiency()\nCalculate the packing efficiency (ratio of tokens used to total token slots).\nHigher is better - 1.0 would mean perfect packing with no wasted space.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_efficiency()\nGather and synchronize packing efficiency estimates across all distributed\nranks.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nfloat\nA conservative efficiency estimate based on the measurements.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_len_batches(num)\nGather and synchronize batch counts across all distributed ranks. 
Returns\nthe minimum number of batches available on any rank.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.generate_batches(set_stats=False)\nGenerate packed batches for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nset_stats\nbool\nWhether to update efficiency statistics.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[list[int]]]\nList of batches, where each batch contains multiple bins, and each bin contains multiple sequence indices.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.set_epoch(epoch)\nSet the epoch number, used for reproducible shuffling across epochs"
+    "text": "utils.callbacks.mlflow_\nMLFlow module for trainer callbacks\n\n\n\n\n\nName\nDescription\n\n\n\n\nSaveAxolotlConfigtoMlflowCallback\nCallback to save axolotl config to mlflow\n\n\n\n\n\nutils.callbacks.mlflow_.SaveAxolotlConfigtoMlflowCallback(axolotl_config_path)\nCallback to save axolotl config to mlflow"
   },
   {
-    "objectID": "docs/api/utils.samplers.multipack.html#functions",
-    "href": "docs/api/utils.samplers.multipack.html#functions",
-    "title": "utils.samplers.multipack",
+    "objectID": "docs/api/utils.callbacks.mlflow_.html#classes",
+    "href": "docs/api/utils.callbacks.mlflow_.html#classes",
+    "title": "utils.callbacks.mlflow_",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nallocate_sequentially\nSequential allocator that preserves example order.\n\n\nffd_check\nFirst-fit-decreasing bin packing algorithm check.\n\n\npack_group\nPack a group of sequences into bins using First-Fit Decreasing algorithm.\n\n\npack_parallel\nPack sequences into bins using parallel processing.\n\n\n\n\n\nutils.samplers.multipack.allocate_sequentially(\n    sequence_lengths,\n    rank,\n    bin_capacity,\n    num_ranks,\n)\nSequential allocator that preserves example order.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nThe lengths of all examples.\nrequired\n\n\nrank\nint\nThe current rank (for distributed training).\nrequired\n\n\nbin_capacity\nint\nThe capacity of each bin (maximum sequence length).\nrequired\n\n\nnum_ranks\nint\nNumber of ranks (processes / GPUs).\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nrank_batches\nlist[list[int]]\nList of batches for the current rank.\n\n\ntotal_tokens_used\nint\nNumber of actual example tokens.\n\n\ntotal_token_slots\nint\nMaximum theoretical number of example tokens (number of bins * bin capacity).\n\n\n\n\n\n\n\nutils.samplers.multipack.ffd_check(sequence_lengths, bin_capacity, num_bins)\nFirst-fit-decreasing bin packing algorithm check.\nChecks if sequences with the given lengths could fit in the specified number of\nbins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin.\nrequired\n\n\nnum_bins\nint\nNumber of bins available.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nTrue if all sequences can be packed, False otherwise.\n\n\n\n\n\n\n\nutils.samplers.multipack.pack_group(\n    sequence_lengths,\n    group_offset,\n    bin_capacity,\n    max_bins,\n    bin_size,\n    safe_mode=True,\n)\nPack a group of sequences into bins 
using First-Fit Decreasing algorithm.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\ngroup_offset\nint\nOffset to apply to indices when returning results.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin.\nrequired\n\n\nmax_bins\nint\nMaximum number of bins to use.\nrequired\n\n\nbin_size\nint\nMaximum number of sequences per bin.\nrequired\n\n\nsafe_mode\nbool\nIf True, use a more conservative packing approach.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[int]]\nList of bins, where each bin contains indices of sequences assigned to it.\n\n\n\n\n\n\n\nutils.samplers.multipack.pack_parallel(\n    sequence_lengths,\n    bin_capacity,\n    group_size,\n    bin_size,\n    num_processes=None,\n    safe_mode=True,\n    mp_start_method='fork',\n)\nPack sequences into bins using parallel processing.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin as total number of tokens.\nrequired\n\n\ngroup_size\nint\nNumber of sequences to process in each group.\nrequired\n\n\nbin_size\nint\nMaximum number of bins to use.\nrequired\n\n\nnum_processes\nint | None\nNumber of parallel processes to use.\nNone\n\n\nsafe_mode\nbool\nIf True, use a more conservative packing approach.\nTrue\n\n\nmp_start_method\nstr | None\nMultiprocessing start method (‘fork’, ‘spawn’, ‘forkserver’). ‘spawn’ is often safer with Numba/PyTorch. Set to None to use system default.\n'fork'\n\n\n\nReturns:\nList of bins, where each bin contains indices of sequences assigned to it."
+    "text": "Name\nDescription\n\n\n\n\nSaveAxolotlConfigtoMlflowCallback\nCallback to save axolotl config to mlflow\n\n\n\n\n\nutils.callbacks.mlflow_.SaveAxolotlConfigtoMlflowCallback(axolotl_config_path)\nCallback to save axolotl config to mlflow"
   },
   {
-    "objectID": "docs/api/core.chat.format.shared.html",
-    "href": "docs/api/core.chat.format.shared.html",
-    "title": "core.chat.format.shared",
+    "objectID": "docs/api/prompt_strategies.alpaca_instruct.html",
+    "href": "docs/api/prompt_strategies.alpaca_instruct.html",
+    "title": "prompt_strategies.alpaca_instruct",
     "section": "",
-    "text": "core.chat.format.shared\ncore.chat.format.shared\nshared functions for format transforms"
+    "text": "prompt_strategies.alpaca_instruct\nprompt_strategies.alpaca_instruct\nModule loading the AlpacaInstructPromptTokenizingStrategy class"
   },
   {
-    "objectID": "docs/api/utils.schemas.multimodal.html",
-    "href": "docs/api/utils.schemas.multimodal.html",
-    "title": "utils.schemas.multimodal",
+    "objectID": "docs/api/logging_config.html",
+    "href": "docs/api/logging_config.html",
+    "title": "logging_config",
     "section": "",
-    "text": "utils.schemas.multimodal\nPydantic models for multimodal-related configuration\n\n\n\n\n\nName\nDescription\n\n\n\n\nMultiModalConfig\nMulti-modal configuration subset\n\n\n\n\n\nutils.schemas.multimodal.MultiModalConfig()\nMulti-modal configuration subset\n\n\n\n\n\nName\nDescription\n\n\n\n\nconvert_image_resize_algorithm\nConvert the image resize algorithm to a PIL.Image.Resampling enum.\n\n\n\n\n\nutils.schemas.multimodal.MultiModalConfig.convert_image_resize_algorithm(\n    image_resize_algorithm,\n)\nConvert the image resize algorithm to a PIL.Image.Resampling enum."
+    "text": "logging_config\nCommon logging module for axolotl.\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlLogger\nLogger that applies filtering to non-axolotl loggers.\n\n\nAxolotlOrWarnErrorFilter\nAllows ANY WARNING or higher (unless overridden by LOG_LEVEL). Allows axolotl.* at\n\n\nColorfulFormatter\nFormatter to add coloring to log messages by log type\n\n\n\n\n\nlogging_config.AxolotlLogger(name, level=logging.NOTSET)\nLogger that applies filtering to non-axolotl loggers.\n\n\n\nlogging_config.AxolotlOrWarnErrorFilter(**kwargs)\nAllows ANY WARNING or higher (unless overridden by LOG_LEVEL). Allows axolotl.* at\nINFO or higher (unless overridden by AXOLOTL_LOG_LEVEL). Drops all other records\n(i.e. non-axolotl.INFO, DEBUG, etc. by default).\n\n\n\nlogging_config.ColorfulFormatter()\nFormatter to add coloring to log messages by log type\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nconfigure_logging\nConfigure with default logging\n\n\n\n\n\nlogging_config.configure_logging()\nConfigure with default logging"
   },
   {
-    "objectID": "docs/api/utils.schemas.multimodal.html#classes",
-    "href": "docs/api/utils.schemas.multimodal.html#classes",
-    "title": "utils.schemas.multimodal",
+    "objectID": "docs/api/logging_config.html#classes",
+    "href": "docs/api/logging_config.html#classes",
+    "title": "logging_config",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nMultiModalConfig\nMulti-modal configuration subset\n\n\n\n\n\nutils.schemas.multimodal.MultiModalConfig()\nMulti-modal configuration subset\n\n\n\n\n\nName\nDescription\n\n\n\n\nconvert_image_resize_algorithm\nConvert the image resize algorithm to a PIL.Image.Resampling enum.\n\n\n\n\n\nutils.schemas.multimodal.MultiModalConfig.convert_image_resize_algorithm(\n    image_resize_algorithm,\n)\nConvert the image resize algorithm to a PIL.Image.Resampling enum."
+    "text": "Name\nDescription\n\n\n\n\nAxolotlLogger\nLogger that applies filtering to non-axolotl loggers.\n\n\nAxolotlOrWarnErrorFilter\nAllows ANY WARNING or higher (unless overridden by LOG_LEVEL). Allows axolotl.* at\n\n\nColorfulFormatter\nFormatter to add coloring to log messages by log type\n\n\n\n\n\nlogging_config.AxolotlLogger(name, level=logging.NOTSET)\nLogger that applies filtering to non-axolotl loggers.\n\n\n\nlogging_config.AxolotlOrWarnErrorFilter(**kwargs)\nAllows ANY WARNING or higher (unless overridden by LOG_LEVEL). Allows axolotl.* at\nINFO or higher (unless overridden by AXOLOTL_LOG_LEVEL). Drops all other records\n(i.e. non-axolotl.INFO, DEBUG, etc. by default).\n\n\n\nlogging_config.ColorfulFormatter()\nFormatter to add coloring to log messages by log type"
   },
   {
-    "objectID": "docs/multimodal.html",
-    "href": "docs/multimodal.html",
-    "title": "MultiModal / Vision Language Models (BETA)",
+    "objectID": "docs/api/logging_config.html#functions",
+    "href": "docs/api/logging_config.html#functions",
+    "title": "logging_config",
     "section": "",
-    "text": "Gemma-4 (NEW)\nMllama\nLlama4\nPixtral\nLlava-1.5\nMistral-Small-3.1\nMistral-Small-4\nMagistral-Small-2509\nVoxtral\nGemma-3\nGemma-3n\nQwen2-VL\nQwen2.5-VL\nQwen3.5\nGLM-4.6V\nSmolVLM2\nLFM2-VL\nIntern-VL",
+    "text": "Name\nDescription\n\n\n\n\nconfigure_logging\nConfigure with default logging\n\n\n\n\n\nlogging_config.configure_logging()\nConfigure with default logging"
+  },
+  {
+    "objectID": "docs/api/core.builders.base.html",
+    "href": "docs/api/core.builders.base.html",
+    "title": "core.builders.base",
+    "section": "",
+    "text": "core.builders.base\nBase class for trainer builder\n\n\n\n\n\nName\nDescription\n\n\n\n\nTrainerBuilderBase\nBase class for trainer builder.\n\n\n\n\n\ncore.builders.base.TrainerBuilderBase(cfg, model, tokenizer, processor=None)\nBase class for trainer builder.\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_post_trainer_create_callbacks\nCallbacks added after the trainer is created, usually b/c these need access to the trainer\n\n\n\n\n\ncore.builders.base.TrainerBuilderBase.get_post_trainer_create_callbacks(trainer)\nCallbacks added after the trainer is created, usually b/c these need access to the trainer"
+  },
+  {
+    "objectID": "docs/api/core.builders.base.html#classes",
+    "href": "docs/api/core.builders.base.html#classes",
+    "title": "core.builders.base",
+    "section": "",
+    "text": "Name\nDescription\n\n\n\n\nTrainerBuilderBase\nBase class for trainer builder.\n\n\n\n\n\ncore.builders.base.TrainerBuilderBase(cfg, model, tokenizer, processor=None)\nBase class for trainer builder.\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_post_trainer_create_callbacks\nCallbacks added after the trainer is created, usually b/c these need access to the trainer\n\n\n\n\n\ncore.builders.base.TrainerBuilderBase.get_post_trainer_create_callbacks(trainer)\nCallbacks added after the trainer is created, usually b/c these need access to the trainer"
+  },
+  {
+    "objectID": "docs/api/utils.schemas.enums.html",
+    "href": "docs/api/utils.schemas.enums.html",
+    "title": "utils.schemas.enums",
+    "section": "",
+    "text": "utils.schemas.enums\nEnums for Axolotl input config\n\n\n\n\n\nName\nDescription\n\n\n\n\nChatTemplate\nChat templates configuration subset\n\n\nCustomSupportedOptimizers\nCustom supported optimizers\n\n\nRLType\nRL trainer type configuration subset\n\n\nRingAttnFunc\nEnum class for supported ring-flash-attn implementations\n\n\n\n\n\nutils.schemas.enums.ChatTemplate()\nChat templates configuration subset\n\n\n\nutils.schemas.enums.CustomSupportedOptimizers()\nCustom supported optimizers\n\n\n\nutils.schemas.enums.RLType()\nRL trainer type configuration subset\n\n\n\nutils.schemas.enums.RingAttnFunc()\nEnum class for supported ring-flash-attn implementations"
+  },
+  {
+    "objectID": "docs/api/utils.schemas.enums.html#classes",
+    "href": "docs/api/utils.schemas.enums.html#classes",
+    "title": "utils.schemas.enums",
+    "section": "",
+    "text": "Name\nDescription\n\n\n\n\nChatTemplate\nChat templates configuration subset\n\n\nCustomSupportedOptimizers\nCustom supported optimizers\n\n\nRLType\nRL trainer type configuration subset\n\n\nRingAttnFunc\nEnum class for supported ring-flash-attn implementations\n\n\n\n\n\nutils.schemas.enums.ChatTemplate()\nChat templates configuration subset\n\n\n\nutils.schemas.enums.CustomSupportedOptimizers()\nCustom supported optimizers\n\n\n\nutils.schemas.enums.RLType()\nRL trainer type configuration subset\n\n\n\nutils.schemas.enums.RingAttnFunc()\nEnum class for supported ring-flash-attn implementations"
+  },
+  {
+    "objectID": "docs/getting-started.html",
+    "href": "docs/getting-started.html",
+    "title": "Quickstart",
+    "section": "",
+    "text": "This guide will walk you through your first model fine-tuning project with Axolotl.",
     "crumbs": [
-      "How To Guides",
-      "MultiModal / Vision Language Models (BETA)"
+      "Getting Started",
+      "Quickstart"
     ]
   },
   {
-    "objectID": "docs/multimodal.html#supported-models",
-    "href": "docs/multimodal.html#supported-models",
-    "title": "MultiModal / Vision Language Models (BETA)",
-    "section": "",
-    "text": "Gemma-4 (NEW)\nMllama\nLlama4\nPixtral\nLlava-1.5\nMistral-Small-3.1\nMistral-Small-4\nMagistral-Small-2509\nVoxtral\nGemma-3\nGemma-3n\nQwen2-VL\nQwen2.5-VL\nQwen3.5\nGLM-4.6V\nSmolVLM2\nLFM2-VL\nIntern-VL",
+    "objectID": "docs/getting-started.html#sec-quick-example",
+    "href": "docs/getting-started.html#sec-quick-example",
+    "title": "Quickstart",
+    "section": "1 Quick Example",
+    "text": "1 Quick Example\nLet’s start by fine-tuning a small language model using LoRA. This example uses a 1B parameter model to ensure it runs on most GPUs.\nAssuming axolotl is installed (if not, see our Installation Guide)\n\nDownload example configs:\n\naxolotl fetch examples\n\nRun the training:\n\naxolotl train examples/llama-3/lora-1b.yml\nThat’s it! Let’s understand what just happened.",
     "crumbs": [
-      "How To Guides",
-      "MultiModal / Vision Language Models (BETA)"
+      "Getting Started",
+      "Quickstart"
     ]
   },
   {
-    "objectID": "docs/multimodal.html#usage",
-    "href": "docs/multimodal.html#usage",
-    "title": "MultiModal / Vision Language Models (BETA)",
-    "section": "Usage",
-    "text": "Usage\nMultimodal support is limited and doesn’t have full feature parity.\nHere are the hyperparams you’ll need to use to finetune a multimodal model.\nprocessor_type: AutoProcessor\n\nskip_prepare_dataset: true\nremove_unused_columns: false  # leave columns in place as they are needed to handle image embeddings during training\nsample_packing: false  # not yet supported with multimodal\n\nchat_template:  # see in next section if specified\n\n# example dataset\ndatasets:\n  - path: HuggingFaceH4/llava-instruct-mix-vsft\n    type: chat_template\n    split: train[:1%]\n\n# (optional) if doing lora, only finetune the Language model,\n# leave the vision model and vision tower frozen\n# load_in_8bit: true\nadapter: lora\nlora_target_modules: 'model.language_model.layers.[\\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'\n\n# (optional) if you want to resize images to a set size\nimage_size: 512\nimage_resize_algorithm: bilinear\nPlease see examples folder for full configs.\n\n\n\n\n\n\nTip\n\n\n\nSome of our chat_templates have been extended to support broader dataset types. This should not break any existing configs.\n\n\n\n\n\n\n\n\nNote\n\n\n\nAs of now, we do not truncate nor drop samples based on sequence_len as each arch has different ways to process non-text tokens. 
We are looking for help on this.\n\n\n\nMllama\nbase_model: meta-llama/Llama-3.2-11B-Vision-Instruct\n\nchat_template: llama3_2_vision\n\n\nLlama4\nbase_model: meta-llama/Llama-4-Scout-17B-16E-Instruct\n\nchat_template: llama4\n\n\nPixtral\nbase_model: mistralai/Pixtral-12B-2409\n\nchat_template: pixtral\n\n\nLlava-1.5\nbase_model: llava-hf/llava-1.5-7b-hf\n\nchat_template: llava\n\n\nMistral-Small-3.1\n\n\n\n\n\n\nTip\n\n\n\nPlease make sure to install vision lib via pip install 'mistral-common[opencv]==1.8.5'\n\n\nbase_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503\n\n\nMistral-Small-4\nbase_model: mistralai/Mistral-Small-4-119B-2603\n\n\nMagistral-Small-2509\n\n\n\n\n\n\nTip\n\n\n\nPlease make sure to install vision lib via pip install 'mistral-common[opencv]==1.8.5'\n\n\nbase_model: mistralai/Magistral-Small-2509\n\n\nVoxtral\n\n\n\n\n\n\nTip\n\n\n\nPlease make sure to install audio lib via pip3 install librosa==0.11.0 'mistral_common[audio]==1.8.3'\n\n\nbase_model: mistralai/Voxtral-Mini-3B-2507\n\nprocessor_type: VoxtralProcessor\n\n\nGemma-4\nAll Gemma 4 variants (E2B, E4B, 26B-A4B, 31B) load as multimodal models even for text-only training.\nbase_model: google/gemma-4-E2B-it  # or E4B-it, 26B-A4B, 31B\n\nchat_template: gemma4\nfreeze_mm_modules: true  # freeze vision/audio encoders for text-only or vision LoRA\n\n# For the 26B-A4B MoE model, enable ScatterMoE and expert LoRA:\nplugins:\n  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n  - axolotl.integrations.kernels.KernelsPlugin\nuse_kernels: true\nuse_scattermoe: true\nexperts_implementation: scattermoe\n\nlora_target_modules: 'model.language_model.layers.[\\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'\n\n# MoE expert LoRA (3D tensors, not nn.Linear) — only for 26B-A4B:\nlora_target_parameters:\n  - experts.gate_up_proj\n  - experts.down_proj\n\n\n\n\n\n\nWarning\n\n\n\nGemma 4 VLM training starts with high loss (~8-15). 
This is expected — see the training stability guide for details.\n\n\n\n\n\n\n\n\nTip\n\n\n\nFor DDP training, axolotl auto-detects Gemma4 and sets use_reentrant=False and ddp_find_unused_parameters=True. However, when activation_offloading: true, ddp_find_unused_parameters is skipped (checkpoint wrappers conflict with it); use freeze_mm_modules: true instead to handle unused vision/audio params. For FSDP2, use fsdp_transformer_layer_cls_to_wrap: Gemma4TextDecoderLayer.\n\n\n\n\nGemma-3\n\n\n\n\n\n\nTip\n\n\n\nThe Gemma3-1B model is a text-only model, so please train as regular text model.\n\n\nFor multi-modal 4B/12B/27B models, use the following config:\nbase_model: google/gemma-3-4b-it\n\nchat_template: gemma3\n\n\nGemma-3n\n\n\n\n\n\n\nWarning\n\n\n\nThe model’s initial loss and grad norm will be very high. We suspect this to be due to the Conv in the vision layers.\n\n\n\n\n\n\n\n\nTip\n\n\n\nPlease make sure to install timm via pip3 install timm==1.0.17\n\n\nbase_model: google/gemma-3n-E2B-it\n\nchat_template: gemma3n\n\n\nQwen2-VL\nbase_model: Qwen/Qwen2-VL-7B-Instruct\n\nchat_template: qwen2_vl\n\n\nQwen2.5-VL\nbase_model: Qwen/Qwen2.5-VL-7B-Instruct\n\nchat_template: qwen2_vl  # same as qwen2-vl\n\n\nQwen3-VL\nbase_model: Qwen/Qwen3-VL-4B-Instruct\n\nchat_template: qwen2_vl  # same as qwen2-vl\n\n\nQwen3.5\nbase_model: Qwen/Qwen3.5-9B\n\nchat_template: qwen3_5\n\n\nGLM-4.6V\nBoth GLM-4.6V (106B MoE) and GLM-4.6V-Flash (9B) are supported.\n# GLM-4.6V (106B MoE version)\nbase_model: zai-org/GLM-4.6V\n\n# OR GLM-4.6V-Flash (9B version)\nbase_model: zai-org/GLM-4.6V-Flash\n\n\nSmolVLM2\n\n\n\n\n\n\nTip\n\n\n\nPlease make sure to install num2words via pip3 install num2words==0.5.14\n\n\nbase_model: HuggingFaceTB/SmolVLM2-500M-Video-Instruct\n\n\nLFM2-VL\n\n\n\n\n\n\nWarning\n\n\n\nPlease uninstall causal-conv1d via pip3 uninstall -y causal-conv1d\n\n\nbase_model: LiquidAI/LFM2-VL-450M\n\n\nIntern-VL\n\n\n\n\n\n\nTip\n\n\n\nPlease make sure to install timm via 
pip3 install timm==1.0.19\n\n\nbase_model: OpenGVLab/InternVL3_5-8B",
+    "objectID": "docs/getting-started.html#sec-understanding",
+    "href": "docs/getting-started.html#sec-understanding",
+    "title": "Quickstart",
+    "section": "2 Understanding the Process",
+    "text": "2 Understanding the Process\n\n2.1 The Configuration File\nThe YAML configuration file controls everything about your training. Here’s what (part of) our example config looks like:\nbase_model: NousResearch/Llama-3.2-1B\n\nload_in_8bit: true\nadapter: lora\n\ndatasets:\n  - path: teknium/GPT4-LLM-Cleaned\n    type: alpaca\ndataset_prepared_path: last_run_prepared\nval_set_size: 0.1\noutput_dir: ./outputs/lora-out\n\n\n\n\n\n\nTip\n\n\n\nload_in_8bit: true and adapter: lora enables LoRA adapter finetuning.\n\nTo perform Full finetuning, remove these two lines.\nTo perform QLoRA finetuning, replace with load_in_4bit: true and adapter: qlora.\n\n\n\nSee our config options for more details.\n\n\n2.2 Training\nWhen you run axolotl train, Axolotl:\n\nDownloads the base model\n(If specified) applies QLoRA/LoRA adapter layers\nLoads and processes the dataset\nRuns the training loop\nSaves the trained model and / or LoRA weights",
     "crumbs": [
-      "How To Guides",
-      "MultiModal / Vision Language Models (BETA)"
+      "Getting Started",
+      "Quickstart"
     ]
   },
   {
-    "objectID": "docs/multimodal.html#dataset-format",
-    "href": "docs/multimodal.html#dataset-format",
-    "title": "MultiModal / Vision Language Models (BETA)",
-    "section": "Dataset Format",
-    "text": "Dataset Format\nFor multi-modal datasets, we adopt an extended chat_template format similar to OpenAI’s Message format.\n\nA message is a list of role and content.\nrole can be system, user, assistant, etc.\ncontent is a list of type and (text, image, path, url, base64, or audio).\n\n\nImage\n\n\n\n\n\n\nNote\n\n\n\nFor backwards compatibility:\n\nIf the dataset has a images or image column of list[Image], it will be appended to the first content list as {\"type\": \"image\", \"image\": ...}. However, if the content already has a {\"type\": \"image\"} but no image key, it will be set the image key.\nIf content is a string, it will be converted to a list with type as text.\n\n\n\nFor image loading, you can use the following keys within content alongside \"type\": \"image\":\n\n\"path\": \"/path/to/image.jpg\"\n\"url\": \"https://example.com/image.jpg\"\n\"base64\": \"...\"\n\"image\": PIL.Image\n\n\n\nAudio\nFor audio loading, you can use the following keys within content alongside \"type\": \"audio\":\n\n\"path\": \"/path/to/audio.mp3\"\n\"url\": \"https://example.com/audio.mp3\"\n\"audio\": np.ndarray\n\n\n\n\n\n\n\nTip\n\n\n\nYou may need to install librosa via pip3 install librosa==0.11.0.\n\n\n\n\nVideo\n\n\n\n\n\n\nWarning\n\n\n\nThis is not well tested at the moment. 
We welcome contributors!\n\n\nFor video loading, you can use the following keys within content alongside \"type\": \"video\":\n\n\"path\": \"/path/to/video.mp4\"\n\"url\": \"https://example.com/video.mp4\"\n\"video\": np.ndarray | list[PIL.Image.Image] | torch.Tensor (or list of the aforementioned)\n\n\n\nExample\nHere is an example of a multi-modal dataset:\n[\n  {\n    \"messages\": [\n        {\n            \"role\": \"system\",\n            \"content\": [\n              {\"type\": \"text\", \"text\": \"You are a helpful assistant.\"}\n              ]\n        },\n        {\n            \"role\": \"user\",\n            \"content\": [\n                {\"type\": \"image\", \"url\": \"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg\"},\n                {\"type\": \"text\", \"text\": \"Describe this image in detail.\"}\n            ]\n        },\n        {\n            \"role\": \"assistant\",\n            \"content\": [\n              {\"type\": \"text\", \"text\": \"The image is a bee.\"}\n            ]\n        }\n    ]\n  }\n]",
+    "objectID": "docs/getting-started.html#sec-custom",
+    "href": "docs/getting-started.html#sec-custom",
+    "title": "Quickstart",
+    "section": "3 Your First Custom Training",
+    "text": "3 Your First Custom Training\nLet’s modify the example for your own data:\n\nCreate a new config file my_training.yml:\n\nbase_model: NousResearch/Nous-Hermes-llama-1b-v1\n\nload_in_8bit: true\nadapter: lora\n\n# Training settings\nmicro_batch_size: 2\nnum_epochs: 3\nlearning_rate: 0.0003\n\n# Your dataset\ndatasets:\n  - path: my_data.jsonl        # Your local data file\n    type: alpaca               # Or other format\nThis specific config is for LoRA fine-tuning a model with instruction tuning data using\nthe alpaca dataset format, which has the following format:\n{\n    \"instruction\": \"Write a description of alpacas.\",\n    \"input\": \"\",\n    \"output\": \"Alpacas are domesticated South American camelids...\"\n}\nPlease see our Dataset Formats for more dataset formats and how to\nformat them.\n\nPrepare your JSONL data in the specified format (in this case, the expected alpaca\nformat):\n\n{\"instruction\": \"Classify this text\", \"input\": \"I love this!\", \"output\": \"positive\"}\n{\"instruction\": \"Classify this text\", \"input\": \"Not good at all\", \"output\": \"negative\"}\n\nRun the training:\n\naxolotl train my_training.yml",
     "crumbs": [
-      "How To Guides",
-      "MultiModal / Vision Language Models (BETA)"
+      "Getting Started",
+      "Quickstart"
     ]
   },
   {
-    "objectID": "docs/multimodal.html#faq",
-    "href": "docs/multimodal.html#faq",
-    "title": "MultiModal / Vision Language Models (BETA)",
-    "section": "FAQ",
-    "text": "FAQ\n\nPIL.UnidentifiedImageError: cannot identify image file ...\n\nPIL could not retrieve the file at url using requests. Please check for typo. One alternative reason is that the request is blocked by the server.",
+    "objectID": "docs/getting-started.html#sec-common-tasks",
+    "href": "docs/getting-started.html#sec-common-tasks",
+    "title": "Quickstart",
+    "section": "4 Common Tasks",
+    "text": "4 Common Tasks\n\n\n\n\n\n\nTip\n\n\n\nThe same yaml file is used for training, inference, and merging.\n\n\n\n4.1 Testing Your Model\nAfter training, test your model:\naxolotl inference my_training.yml --lora-model-dir=\"./outputs/lora-out\"\nMore details can be found in Inference.\n\n\n4.2 Using a UI\nLaunch a Gradio interface:\naxolotl inference my_training.yml --lora-model-dir=\"./outputs/lora-out\" --gradio\n\n\n4.3 Preprocessing Data\nFor large datasets, preprocess first:\naxolotl preprocess my_training.yml\nPlease make sure to set dataset_prepared_path: in your config to set the path to save the prepared dataset.\nMore details can be found in Dataset Preprocessing.\n\n\n4.4 Merging LoRA weights\nTo merge the LoRA weights back into the base model, run:\naxolotl merge-lora my_training.yml --lora-model-dir=\"./outputs/lora-out\"\nThe merged model will be saved in the {output_dir}/merged directory.\nMore details can be found in Merging LoRA weights.",
     "crumbs": [
-      "How To Guides",
-      "MultiModal / Vision Language Models (BETA)"
+      "Getting Started",
+      "Quickstart"
     ]
   },
   {
-    "objectID": "docs/input_output.html",
-    "href": "docs/input_output.html",
-    "title": "Template-free prompt construction",
-    "section": "",
-    "text": "The documentation moved to here."
+    "objectID": "docs/getting-started.html#sec-next-steps",
+    "href": "docs/getting-started.html#sec-next-steps",
+    "title": "Quickstart",
+    "section": "5 Next Steps",
+    "text": "5 Next Steps\nNow that you have the basics, explore these guides based on what you want to do:\nChoose your path:\n\nChoosing a Fine-Tuning Method — SFT vs LoRA vs QLoRA vs GRPO vs DPO, with hardware recommendations\n\nCore guides:\n\nDataset Loading — Loading datasets from various sources\nDataset Formats — Working with different data formats\nOptimizations — Flash attention, gradient checkpointing, sample packing\nTraining Stability & Debugging — Monitoring metrics, fixing NaN, OOM debugging\n\nAdvanced training methods:\n\nRLHF / Preference Learning — DPO, KTO, GRPO, EBFT\nGRPO Training — RL with custom rewards and vLLM generation\nvLLM Serving — Setting up vLLM for GRPO\n\nScaling up:\n\nMulti-GPU Training — DeepSpeed, FSDP, DDP\nMulti-Node Training — Distributed training across machines",
+    "crumbs": [
+      "Getting Started",
+      "Quickstart"
+    ]
   },
   {
     "objectID": "docs/multi-gpu.html",
@@ -3349,18 +3388,18 @@
     "href": "docs/installation.html#sec-requirements",
     "title": "Installation",
     "section": "1 Requirements",
-    "text": "1 Requirements\n\nNVIDIA GPU (Ampere architecture or newer for bf16 and Flash Attention) or AMD GPU\nPython ≥3.11\nPyTorch ≥2.6.0",
+    "text": "1 Requirements\n\nNVIDIA GPU (Ampere architecture or newer for bf16 and Flash Attention) or AMD GPU\nPython ≥3.11\nPyTorch ≥2.9.0",
     "crumbs": [
       "Getting Started",
       "Installation"
     ]
   },
   {
-    "objectID": "docs/installation.html#sec-installation-methods",
-    "href": "docs/installation.html#sec-installation-methods",
+    "objectID": "docs/installation.html#sec-installation",
+    "href": "docs/installation.html#sec-installation",
     "title": "Installation",
-    "section": "2 Installation Methods",
-    "text": "2 Installation Methods\n\n\n\n\n\n\nImportant\n\n\n\nPlease make sure to have Pytorch installed before installing Axolotl in your local environment.\nFollow the instructions at: https://pytorch.org/get-started/locally/\n\n\n\n\n\n\n\n\nImportant\n\n\n\nFor Blackwell GPUs, please use Pytorch 2.9.1 and CUDA 12.8.\n\n\n\n2.1 PyPI Installation (Recommended)\npip3 install -U packaging setuptools wheel ninja\npip3 install --no-build-isolation axolotl[flash-attn,deepspeed]\nWe use --no-build-isolation in order to detect the installed PyTorch version (if\ninstalled) in order not to clobber it, and so that we set the correct version of\ndependencies that are specific to the PyTorch version or other installed\nco-dependencies.\n\n\n2.2 uv Installation\nuv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.\nInstall uv if not already installed\ncurl -LsSf https://astral.sh/uv/install.sh | sh\nsource $HOME/.local/bin/env\nChoose your CUDA version to use with PyTorch; e.g. 
cu124, cu126, cu128,\nthen create the venv and activate\nexport UV_TORCH_BACKEND=cu126\nuv venv --no-project --relocatable\nsource .venv/bin/activate\nInstall PyTorch\n- PyTorch 2.6.0 recommended\nuv pip install packaging setuptools wheel\nuv pip install torch==2.6.0\nuv pip install awscli pydantic\nInstall axolotl from PyPi\nuv pip install --no-build-isolation axolotl[deepspeed,flash-attn]\n\n# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO\nuv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm]\n\n\n2.3 Edge/Development Build\nFor the latest features between releases:\ngit clone https://github.com/axolotl-ai-cloud/axolotl.git\ncd axolotl\npip3 install -U packaging setuptools wheel ninja\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n\n\n2.4 Docker\ndocker run --gpus '\"all\"' --rm -it axolotlai/axolotl:main-latest\nFor development with Docker:\ndocker compose up -d\n\n\n\n\n\n\nTipAdvanced Docker Configuration\n\n\n\ndocker run --privileged --gpus '\"all\"' --shm-size 10g --rm -it \\\n  --name axolotl --ipc=host \\\n  --ulimit memlock=-1 --ulimit stack=67108864 \\\n  --mount type=bind,src=\"${PWD}\",target=/workspace/axolotl \\\n  -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \\\n  axolotlai/axolotl:main-latest\n\n\n\n\n\n\n\n\nImportant\n\n\n\nFor Blackwell GPUs, please use axolotlai/axolotl:main-py3.11-cu128-2.9.1 or the cloud variant axolotlai/axolotl-cloud:main-py3.11-cu128-2.9.1.\n\n\nPlease refer to the Docker documentation for more information on the different Docker images that are available.",
+    "section": "2 Installation",
+    "text": "2 Installation\n\n\n\n\n\n\nImportant\n\n\n\nFor Blackwell GPUs, please use Pytorch 2.9.1 and CUDA 12.8.\n\n\n\n2.1 Quick Install\nAxolotl uses uv as its package manager. uv is a fast, reliable Python package installer and resolver built in Rust.\nInstall uv if not already installed:\ncurl -LsSf https://astral.sh/uv/install.sh | sh\nsource $HOME/.local/bin/env\nChoose your CUDA version (e.g. cu128, cu130), create a venv, and install:\nexport UV_TORCH_BACKEND=cu128  # or cu130\nuv venv --no-project --relocatable\nsource .venv/bin/activate\nuv pip install --no-build-isolation axolotl[flash-attn,deepspeed]\n\n\n2.2 Edge/Development Build\nFor the latest features between releases:\ngit clone https://github.com/axolotl-ai-cloud/axolotl.git\ncd axolotl\nexport UV_TORCH_BACKEND=cu128  # or cu130\nuv sync --extra flash-attn --extra deepspeed\nsource .venv/bin/activate\nuv sync creates a .venv, installs exact pinned versions from uv.lock, and sets up an editable install automatically.\n\n\n2.3 Docker\ndocker run --gpus '\"all\"' --rm -it --ipc=host axolotlai/axolotl-uv:main-latest\nFor development with Docker:\ndocker compose up -d\n\n\n\n\n\n\nTipAdvanced Docker Configuration\n\n\n\ndocker run --privileged --gpus '\"all\"' --shm-size 10g --rm -it \\\n  --name axolotl --ipc=host \\\n  --ulimit memlock=-1 --ulimit stack=67108864 \\\n  --mount type=bind,src=\"${PWD}\",target=/workspace/axolotl \\\n  -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \\\n  axolotlai/axolotl-uv:main-latest\n\n\n\n\n\n\n\n\nImportant\n\n\n\nFor Blackwell GPUs, please use axolotlai/axolotl-uv:main-py3.11-cu128-2.9.1 or the cloud variant axolotlai/axolotl-cloud-uv:main-py3.11-cu128-2.9.1.\n\n\nPlease refer to the Docker documentation for more information on the different Docker images that are available.",
     "crumbs": [
       "Getting Started",
       "Installation"
@@ -3371,7 +3410,7 @@
     "href": "docs/installation.html#sec-cloud",
     "title": "Installation",
     "section": "3 Cloud Environments",
-    "text": "3 Cloud Environments\n\n3.1 Cloud GPU Providers\nFor providers supporting Docker:\n\nUse axolotlai/axolotl-cloud:main-latest\nAvailable on:\n\nRunPod\nVast.ai\nPRIME Intellect\nModal\nNovita\nJarvisLabs.ai\nLatitude.sh\n\n\n\n\n3.2 Google Colab",
+    "text": "3 Cloud Environments\n\n3.1 Cloud GPU Providers\nFor providers supporting Docker:\n\nUse axolotlai/axolotl-cloud-uv:main-latest\nAvailable on:\n\nRunPod\nVast.ai\nPRIME Intellect\nModal\nNovita\nJarvisLabs.ai\nLatitude.sh\n\n\n\n\n3.2 Google Colab",
     "crumbs": [
       "Getting Started",
       "Installation"
@@ -3382,18 +3421,29 @@
     "href": "docs/installation.html#sec-platform-specific",
     "title": "Installation",
     "section": "4 Platform-Specific Instructions",
-    "text": "4 Platform-Specific Instructions\n\n4.1 macOS\npip3 install --no-build-isolation -e '.'\nSee Section 6 for Mac-specific issues.\n\n\n4.2 Windows\n\n\n\n\n\n\nImportant\n\n\n\nWe recommend using WSL2 (Windows Subsystem for Linux) or Docker.",
+    "text": "4 Platform-Specific Instructions\n\n4.1 macOS\nuv pip install --no-build-isolation -e '.'\nSee Section 7 for Mac-specific issues.\n\n\n4.2 Windows\n\n\n\n\n\n\nImportant\n\n\n\nWe recommend using WSL2 (Windows Subsystem for Linux) or Docker.",
     "crumbs": [
       "Getting Started",
       "Installation"
     ]
   },
   {
-    "objectID": "docs/installation.html#sec-env-managers",
-    "href": "docs/installation.html#sec-env-managers",
+    "objectID": "docs/installation.html#sec-migrating",
+    "href": "docs/installation.html#sec-migrating",
     "title": "Installation",
-    "section": "5 Environment Managers",
-    "text": "5 Environment Managers\n\n5.1 Conda/Pip venv\n\nInstall Python ≥3.11\nInstall PyTorch: https://pytorch.org/get-started/locally/\nInstall Axolotl:\npip3 install -U packaging setuptools wheel ninja\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n(Optional) Login to Hugging Face:\nhf auth login",
+    "section": "5 Migrating from pip to uv",
+    "text": "5 Migrating from pip to uv\nIf you have an existing pip-based Axolotl installation, you can migrate to uv:\n# Install uv\ncurl -LsSf https://astral.sh/uv/install.sh | sh\nsource $HOME/.local/bin/env\n\n# Create a fresh venv (recommended for a clean start)\nexport UV_TORCH_BACKEND=cu128  # or cu130\nuv venv --no-project --relocatable\nsource .venv/bin/activate\n\n# Reinstall axolotl\nuv pip install --no-build-isolation axolotl[flash-attn,deepspeed]",
+    "crumbs": [
+      "Getting Started",
+      "Installation"
+    ]
+  },
+  {
+    "objectID": "docs/installation.html#sec-pip",
+    "href": "docs/installation.html#sec-pip",
+    "title": "Installation",
+    "section": "6 Using pip (Alternative)",
+    "text": "6 Using pip (Alternative)\nIf you are unable to install uv, you can still use pip directly.\n\n\n\n\n\n\nImportant\n\n\n\nPlease make sure to have PyTorch installed before installing Axolotl with pip.\nFollow the instructions at: https://pytorch.org/get-started/locally/\n\n\npip3 install -U packaging setuptools wheel ninja\npip3 install --no-build-isolation axolotl[flash-attn,deepspeed]\nFor editable/development installs:\npip3 install -U packaging setuptools wheel ninja\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'",
     "crumbs": [
       "Getting Started",
       "Installation"
@@ -3403,8 +3453,8 @@
     "objectID": "docs/installation.html#sec-troubleshooting",
     "href": "docs/installation.html#sec-troubleshooting",
     "title": "Installation",
-    "section": "6 Troubleshooting",
-    "text": "6 Troubleshooting\nIf you encounter installation issues, see our FAQ and Debugging Guide.",
+    "section": "7 Troubleshooting",
+    "text": "7 Troubleshooting\nIf you encounter installation issues, see our FAQ and Debugging Guide.",
     "crumbs": [
       "Getting Started",
       "Installation"
@@ -3727,7 +3777,7 @@
     "href": "docs/docker.html",
     "title": "Docker",
     "section": "",
-    "text": "This section describes the different Docker images that are released by AxolotlAI at Docker Hub.",
+    "text": "This section describes the different Docker images that are released by AxolotlAI at\nDocker Hub.",
     "crumbs": [
       "Deployments",
       "Docker"
@@ -3738,7 +3788,7 @@
     "href": "docs/docker.html#base",
     "title": "Docker",
     "section": "Base",
-    "text": "Base\nThe base image is the most minimal image that can install Axolotl. It is based on the nvidia/cuda image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.\n\nImage\naxolotlai/axolotl-base\nLink: Docker Hub\n\n\nTags format\nmain-base-py{python_version}-cu{cuda_version}-{pytorch_version}\nTags examples:\n\nmain-base-py3.11-cu128-2.8.0\nmain-base-py3.11-cu128-2.9.1",
+    "text": "Base\nThe base image is the most minimal image that can install Axolotl. It is based on the nvidia/cuda image.\nIt includes python, torch, git, git-lfs, awscli, pydantic, and more.\n\nImage\n\n\n\nVariant\nImage\nDocker Hub\n\n\n\n\npip\naxolotlai/axolotl-base\nLink\n\n\nuv\naxolotlai/axolotl-base-uv\nLink\n\n\n\n\n\nTags format\nmain-base-py{python_version}-cu{cuda_version}-{pytorch_version}\nTags examples:\n\nmain-base-py3.11-cu128-2.9.1\nmain-base-py3.12-cu128-2.10.0\nmain-base-py3.12-cu130-2.9.1\nmain-base-py3.12-cu130-2.10.0",
     "crumbs": [
       "Deployments",
       "Docker"
@@ -3749,7 +3799,7 @@
     "href": "docs/docker.html#main",
     "title": "Docker",
     "section": "Main",
-    "text": "Main\nThe main image is the image that is used to run Axolotl. It is based on the axolotlai/axolotl-base image and includes the Axolotl codebase, dependencies, and more.\n\nImage\naxolotlai/axolotl\nLink: Docker Hub\n\n\nTags format\n# on push to main\nmain-py{python_version}-cu{cuda_version}-{pytorch_version}\n\n# latest main (currently torch 2.6.0, python 3.11, cuda 12.4)\nmain-latest\n\n# nightly build\n{branch}-{date_in_YYYYMMDD}-py{python_version}-cu{cuda_version}-{pytorch_version}\n\n# tagged release\n{version}\n\n\n\n\n\n\nTip\n\n\n\nThere may be some extra tags appended to the image, like -vllm which installs those packages.\n\n\nTags examples:\n\nmain-py3.11-cu128-2.8.0\nmain-py3.11-cu128-2.9.1\nmain-latest\nmain-20250303-py3.11-cu124-2.6.0\nmain-20250303-py3.11-cu126-2.6.0\n0.12.0",
+    "text": "Main\nThe main image is the image that is used to run Axolotl. It is based on the axolotlai/axolotl-base image and includes the Axolotl codebase, dependencies, and more.\n\nImage\n\n\n\nVariant\nImage\nDocker Hub\n\n\n\n\npip\naxolotlai/axolotl\nLink\n\n\nuv\naxolotlai/axolotl-uv\nLink\n\n\n\n\n\nTags format\n# on push to main\nmain-py{python_version}-cu{cuda_version}-{pytorch_version}\n\n# latest main (currently torch 2.9.1, python 3.11, cuda 12.8)\nmain-latest\n\n# nightly build\n{branch}-{date_in_YYYYMMDD}-py{python_version}-cu{cuda_version}-{pytorch_version}\n\n# tagged release\n{version}\n\n\n\n\n\n\nTip\n\n\n\nThere may be some extra tags appended to the image, like -vllm which installs those packages.\n\n\nTags examples:\n\nmain-py3.11-cu128-2.9.1\nmain-py3.12-cu128-2.10.0\nmain-py3.12-cu130-2.9.1\nmain-py3.12-cu130-2.10.0\nmain-latest\nmain-20260315-py3.11-cu128-2.9.1\n0.12.0",
     "crumbs": [
       "Deployments",
       "Docker"
@@ -3760,7 +3810,7 @@
     "href": "docs/docker.html#cloud",
     "title": "Docker",
     "section": "Cloud",
-    "text": "Cloud\nThe cloud image is the image that is used to run Axolotl in the cloud. It is based on the axolotlai/axolotl image and sets ENV variables like HuggingFace cache directories for volume mounts, tmux, and more for different cloud providers.\n\n\n\n\n\n\nTip\n\n\n\nJupyter lab is run by default. Set JUPYTER_DISABLE=1 in the environment variables to disable it.\n\n\n\nImage\naxolotlai/axolotl-cloud\nLink: Docker Hub\n\n\nTags format\nThis uses the same tags as the main image.\n\n\nEnvironment variables\n\nJUPYTER_DISABLE: Disable Jupyter lab.\nJUPYTER_PASSWORD: Set a password for the Jupyter lab.\nPUBLIC_KEY / SSH_KEY: Add a public key for the SSH service.\n\n\n\nVolume mounts\n\n\n\n\n\n\nTip\n\n\n\nWe recommend mounting volumes to /workspace/data for data persistence. /workspace/axolotl contains the source code and is ephemeral.\n\n\n\n/workspace/data/axolotl-artifacts: Directory to store Axolotl artifacts.\n/workspace/data/huggingface-cache: Directory to store HuggingFace cache.",
+    "text": "Cloud\nThe cloud image is the image that is used to run Axolotl in the cloud. It is based on the axolotlai/axolotl image and sets ENV variables like HuggingFace cache directories for volume mounts, tmux, and more for different cloud providers.\n\n\n\n\n\n\nTip\n\n\n\nJupyter lab is run by default. Set JUPYTER_DISABLE=1 in the environment variables to disable it.\n\n\n\nImage\n\n\n\nVariant\nImage\nDocker Hub\n\n\n\n\npip\naxolotlai/axolotl-cloud\nLink\n\n\nuv\naxolotlai/axolotl-cloud-uv\nLink\n\n\n\n\n\nTags format\nThis uses the same tags as the main image.\n\n\nEnvironment variables\n\nJUPYTER_DISABLE: Disable Jupyter lab.\nJUPYTER_PASSWORD: Set a password for the Jupyter lab.\nPUBLIC_KEY / SSH_KEY: Add a public key for the SSH service.\n\n\n\nVolume mounts\n\n\n\n\n\n\nTip\n\n\n\nWe recommend mounting volumes to /workspace/data for data persistence. /workspace/axolotl contains the source code and is ephemeral.\n\n\n\n/workspace/data/axolotl-artifacts: Directory to store Axolotl artifacts.\n/workspace/data/huggingface-cache: Directory to store HuggingFace cache.",
     "crumbs": [
       "Deployments",
       "Docker"
@@ -4482,914 +4532,857 @@
     ]
   },
   {
-    "objectID": "docs/unsloth.html",
-    "href": "docs/unsloth.html",
-    "title": "Unsloth",
+    "objectID": "docs/input_output.html",
+    "href": "docs/input_output.html",
+    "title": "Template-free prompt construction",
     "section": "",
-    "text": "Overview\nUnsloth provides hand-written optimized kernels for LLM finetuning that slightly improve speed and VRAM over\nstandard industry baselines.\n\n\n\n\n\n\nImportant\n\n\n\nDue to breaking changes in transformers v4.48.0, users will need to downgrade to &lt;=v4.47.1 to use this patch.\nThis will later be deprecated in favor of LoRA Optimizations.\n\n\n\n\nInstallation\nThe following will install the correct unsloth and extras from source.\npython scripts/unsloth_install.py | sh\n\n\nUsage\nAxolotl exposes a few configuration options to try out unsloth and get most of the performance gains.\nOur unsloth integration is currently limited to the following model architectures:\n- llama\nThese options are specific to LoRA finetuning and cannot be used for multi-GPU finetuning\nunsloth_lora_mlp: true\nunsloth_lora_qkv: true\nunsloth_lora_o: true\nThese options are composable and can be used with multi-gpu finetuning\nunsloth_cross_entropy_loss: true\nunsloth_rms_norm: true\nunsloth_rope: true\n\n\nLimitations\n\nSingle GPU only; e.g. no multi-gpu support\nNo deepspeed or FSDP support (requires multi-gpu)\nLoRA + QLoRA support only. No full fine tunes or fp8 support.\nLimited model architecture support. Llama, Phi, Gemma, Mistral only\nNo MoE support.",
+    "text": "The documentation moved to here."
+  },
+  {
+    "objectID": "docs/multimodal.html",
+    "href": "docs/multimodal.html",
+    "title": "MultiModal / Vision Language Models (BETA)",
+    "section": "",
+    "text": "Gemma-4 (NEW)\nMllama\nLlama4\nPixtral\nLlava-1.5\nMistral-Small-3.1\nMistral-Small-4\nMagistral-Small-2509\nVoxtral\nGemma-3\nGemma-3n\nQwen2-VL\nQwen2.5-VL\nQwen3.5\nGLM-4.6V\nSmolVLM2\nLFM2-VL\nIntern-VL",
     "crumbs": [
-      "Advanced Features",
-      "Unsloth"
+      "How To Guides",
+      "MultiModal / Vision Language Models (BETA)"
     ]
   },
   {
-    "objectID": "docs/getting-started.html",
-    "href": "docs/getting-started.html",
-    "title": "Quickstart",
+    "objectID": "docs/multimodal.html#supported-models",
+    "href": "docs/multimodal.html#supported-models",
+    "title": "MultiModal / Vision Language Models (BETA)",
     "section": "",
-    "text": "This guide will walk you through your first model fine-tuning project with Axolotl.",
+    "text": "Gemma-4 (NEW)\nMllama\nLlama4\nPixtral\nLlava-1.5\nMistral-Small-3.1\nMistral-Small-4\nMagistral-Small-2509\nVoxtral\nGemma-3\nGemma-3n\nQwen2-VL\nQwen2.5-VL\nQwen3.5\nGLM-4.6V\nSmolVLM2\nLFM2-VL\nIntern-VL",
     "crumbs": [
-      "Getting Started",
-      "Quickstart"
+      "How To Guides",
+      "MultiModal / Vision Language Models (BETA)"
     ]
   },
   {
-    "objectID": "docs/getting-started.html#sec-quick-example",
-    "href": "docs/getting-started.html#sec-quick-example",
-    "title": "Quickstart",
-    "section": "1 Quick Example",
-    "text": "1 Quick Example\nLet’s start by fine-tuning a small language model using LoRA. This example uses a 1B parameter model to ensure it runs on most GPUs.\nAssuming axolotl is installed (if not, see our Installation Guide)\n\nDownload example configs:\n\naxolotl fetch examples\n\nRun the training:\n\naxolotl train examples/llama-3/lora-1b.yml\nThat’s it! Let’s understand what just happened.",
+    "objectID": "docs/multimodal.html#usage",
+    "href": "docs/multimodal.html#usage",
+    "title": "MultiModal / Vision Language Models (BETA)",
+    "section": "Usage",
+    "text": "Usage\nMultimodal support is limited and doesn’t have full feature parity.\nHere are the hyperparams you’ll need to use to finetune a multimodal model.\nprocessor_type: AutoProcessor\n\nskip_prepare_dataset: true\nremove_unused_columns: false  # leave columns in place as they are needed to handle image embeddings during training\nsample_packing: false  # not yet supported with multimodal\n\nchat_template:  # see in next section if specified\n\n# example dataset\ndatasets:\n  - path: HuggingFaceH4/llava-instruct-mix-vsft\n    type: chat_template\n    split: train[:1%]\n\n# (optional) if doing lora, only finetune the Language model,\n# leave the vision model and vision tower frozen\n# load_in_8bit: true\nadapter: lora\nlora_target_modules: 'model.language_model.layers.[\\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'\n\n# (optional) if you want to resize images to a set size\nimage_size: 512\nimage_resize_algorithm: bilinear\nPlease see examples folder for full configs.\n\n\n\n\n\n\nTip\n\n\n\nSome of our chat_templates have been extended to support broader dataset types. This should not break any existing configs.\n\n\n\n\n\n\n\n\nNote\n\n\n\nAs of now, we do not truncate nor drop samples based on sequence_len as each arch has different ways to process non-text tokens. 
We are looking for help on this.\n\n\n\nMllama\nbase_model: meta-llama/Llama-3.2-11B-Vision-Instruct\n\nchat_template: llama3_2_vision\n\n\nLlama4\nbase_model: meta-llama/Llama-4-Scout-17B-16E-Instruct\n\nchat_template: llama4\n\n\nPixtral\nbase_model: mistralai/Pixtral-12B-2409\n\nchat_template: pixtral\n\n\nLlava-1.5\nbase_model: llava-hf/llava-1.5-7b-hf\n\nchat_template: llava\n\n\nMistral-Small-3.1\n\n\n\n\n\n\nTip\n\n\n\nPlease make sure to install vision lib via pip install 'mistral-common[opencv]==1.8.5'\n\n\nbase_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503\n\n\nMistral-Small-4\nbase_model: mistralai/Mistral-Small-4-119B-2603\n\n\nMagistral-Small-2509\n\n\n\n\n\n\nTip\n\n\n\nPlease make sure to install vision lib via pip install 'mistral-common[opencv]==1.8.5'\n\n\nbase_model: mistralai/Magistral-Small-2509\n\n\nVoxtral\n\n\n\n\n\n\nTip\n\n\n\nPlease make sure to install audio lib via pip3 install librosa==0.11.0 'mistral_common[audio]==1.8.3'\n\n\nbase_model: mistralai/Voxtral-Mini-3B-2507\n\nprocessor_type: VoxtralProcessor\n\n\nGemma-4\nAll Gemma 4 variants (E2B, E4B, 26B-A4B, 31B) load as multimodal models even for text-only training.\nbase_model: google/gemma-4-E2B-it  # or E4B-it, 26B-A4B, 31B\n\nchat_template: gemma4\nfreeze_mm_modules: true  # freeze vision/audio encoders for text-only or vision LoRA\n\n# For the 26B-A4B MoE model, enable ScatterMoE and expert LoRA:\nplugins:\n  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n  - axolotl.integrations.kernels.KernelsPlugin\nuse_kernels: true\nuse_scattermoe: true\nexperts_implementation: scattermoe\n\nlora_target_modules: 'model.language_model.layers.[\\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'\n\n# MoE expert LoRA (3D tensors, not nn.Linear) — only for 26B-A4B:\nlora_target_parameters:\n  - experts.gate_up_proj\n  - experts.down_proj\n\n\n\n\n\n\nWarning\n\n\n\nGemma 4 VLM training starts with high loss (~8-15). 
This is expected — see the training stability guide for details.\n\n\n\n\n\n\n\n\nTip\n\n\n\nFor DDP training, axolotl auto-detects Gemma4 and sets use_reentrant=False and ddp_find_unused_parameters=True. However, when activation_offloading: true, ddp_find_unused_parameters is skipped (checkpoint wrappers conflict with it); use freeze_mm_modules: true instead to handle unused vision/audio params. For FSDP2, use fsdp_transformer_layer_cls_to_wrap: Gemma4TextDecoderLayer.\n\n\n\n\nGemma-3\n\n\n\n\n\n\nTip\n\n\n\nThe Gemma3-1B model is a text-only model, so please train as regular text model.\n\n\nFor multi-modal 4B/12B/27B models, use the following config:\nbase_model: google/gemma-3-4b-it\n\nchat_template: gemma3\n\n\nGemma-3n\n\n\n\n\n\n\nWarning\n\n\n\nThe model’s initial loss and grad norm will be very high. We suspect this to be due to the Conv in the vision layers.\n\n\n\n\n\n\n\n\nTip\n\n\n\nPlease make sure to install timm via pip3 install timm==1.0.17\n\n\nbase_model: google/gemma-3n-E2B-it\n\nchat_template: gemma3n\n\n\nQwen2-VL\nbase_model: Qwen/Qwen2-VL-7B-Instruct\n\nchat_template: qwen2_vl\n\n\nQwen2.5-VL\nbase_model: Qwen/Qwen2.5-VL-7B-Instruct\n\nchat_template: qwen2_vl  # same as qwen2-vl\n\n\nQwen3-VL\nbase_model: Qwen/Qwen3-VL-4B-Instruct\n\nchat_template: qwen2_vl  # same as qwen2-vl\n\n\nQwen3.5\nbase_model: Qwen/Qwen3.5-9B\n\nchat_template: qwen3_5\n\n\nGLM-4.6V\nBoth GLM-4.6V (106B MoE) and GLM-4.6V-Flash (9B) are supported.\n# GLM-4.6V (106B MoE version)\nbase_model: zai-org/GLM-4.6V\n\n# OR GLM-4.6V-Flash (9B version)\nbase_model: zai-org/GLM-4.6V-Flash\n\n\nSmolVLM2\n\n\n\n\n\n\nTip\n\n\n\nPlease make sure to install num2words via pip3 install num2words==0.5.14\n\n\nbase_model: HuggingFaceTB/SmolVLM2-500M-Video-Instruct\n\n\nLFM2-VL\n\n\n\n\n\n\nWarning\n\n\n\nPlease uninstall causal-conv1d via pip3 uninstall -y causal-conv1d\n\n\nbase_model: LiquidAI/LFM2-VL-450M\n\n\nIntern-VL\n\n\n\n\n\n\nTip\n\n\n\nPlease make sure to install timm via 
pip3 install timm==1.0.19\n\n\nbase_model: OpenGVLab/InternVL3_5-8B",
     "crumbs": [
-      "Getting Started",
-      "Quickstart"
+      "How To Guides",
+      "MultiModal / Vision Language Models (BETA)"
     ]
   },
   {
-    "objectID": "docs/getting-started.html#sec-understanding",
-    "href": "docs/getting-started.html#sec-understanding",
-    "title": "Quickstart",
-    "section": "2 Understanding the Process",
-    "text": "2 Understanding the Process\n\n2.1 The Configuration File\nThe YAML configuration file controls everything about your training. Here’s what (part of) our example config looks like:\nbase_model: NousResearch/Llama-3.2-1B\n\nload_in_8bit: true\nadapter: lora\n\ndatasets:\n  - path: teknium/GPT4-LLM-Cleaned\n    type: alpaca\ndataset_prepared_path: last_run_prepared\nval_set_size: 0.1\noutput_dir: ./outputs/lora-out\n\n\n\n\n\n\nTip\n\n\n\nload_in_8bit: true and adapter: lora enables LoRA adapter finetuning.\n\nTo perform Full finetuning, remove these two lines.\nTo perform QLoRA finetuning, replace with load_in_4bit: true and adapter: qlora.\n\n\n\nSee our config options for more details.\n\n\n2.2 Training\nWhen you run axolotl train, Axolotl:\n\nDownloads the base model\n(If specified) applies QLoRA/LoRA adapter layers\nLoads and processes the dataset\nRuns the training loop\nSaves the trained model and / or LoRA weights",
+    "objectID": "docs/multimodal.html#dataset-format",
+    "href": "docs/multimodal.html#dataset-format",
+    "title": "MultiModal / Vision Language Models (BETA)",
+    "section": "Dataset Format",
+    "text": "Dataset Format\nFor multi-modal datasets, we adopt an extended chat_template format similar to OpenAI’s Message format.\n\nA message is a list of role and content.\nrole can be system, user, assistant, etc.\ncontent is a list of type and (text, image, path, url, base64, or audio).\n\n\nImage\n\n\n\n\n\n\nNote\n\n\n\nFor backwards compatibility:\n\nIf the dataset has a images or image column of list[Image], it will be appended to the first content list as {\"type\": \"image\", \"image\": ...}. However, if the content already has a {\"type\": \"image\"} but no image key, it will be set the image key.\nIf content is a string, it will be converted to a list with type as text.\n\n\n\nFor image loading, you can use the following keys within content alongside \"type\": \"image\":\n\n\"path\": \"/path/to/image.jpg\"\n\"url\": \"https://example.com/image.jpg\"\n\"base64\": \"...\"\n\"image\": PIL.Image\n\n\n\nAudio\nFor audio loading, you can use the following keys within content alongside \"type\": \"audio\":\n\n\"path\": \"/path/to/audio.mp3\"\n\"url\": \"https://example.com/audio.mp3\"\n\"audio\": np.ndarray\n\n\n\n\n\n\n\nTip\n\n\n\nYou may need to install librosa via pip3 install librosa==0.11.0.\n\n\n\n\nVideo\n\n\n\n\n\n\nWarning\n\n\n\nThis is not well tested at the moment. 
We welcome contributors!\n\n\nFor video loading, you can use the following keys within content alongside \"type\": \"video\":\n\n\"path\": \"/path/to/video.mp4\"\n\"url\": \"https://example.com/video.mp4\"\n\"video\": np.ndarray | list[PIL.Image.Image] | torch.Tensor (or list of the aforementioned)\n\n\n\nExample\nHere is an example of a multi-modal dataset:\n[\n  {\n    \"messages\": [\n        {\n            \"role\": \"system\",\n            \"content\": [\n              {\"type\": \"text\", \"text\": \"You are a helpful assistant.\"}\n              ]\n        },\n        {\n            \"role\": \"user\",\n            \"content\": [\n                {\"type\": \"image\", \"url\": \"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg\"},\n                {\"type\": \"text\", \"text\": \"Describe this image in detail.\"}\n            ]\n        },\n        {\n            \"role\": \"assistant\",\n            \"content\": [\n              {\"type\": \"text\", \"text\": \"The image is a bee.\"}\n            ]\n        }\n    ]\n  }\n]",
     "crumbs": [
-      "Getting Started",
-      "Quickstart"
+      "How To Guides",
+      "MultiModal / Vision Language Models (BETA)"
     ]
   },
   {
-    "objectID": "docs/getting-started.html#sec-custom",
-    "href": "docs/getting-started.html#sec-custom",
-    "title": "Quickstart",
-    "section": "3 Your First Custom Training",
-    "text": "3 Your First Custom Training\nLet’s modify the example for your own data:\n\nCreate a new config file my_training.yml:\n\nbase_model: NousResearch/Nous-Hermes-llama-1b-v1\n\nload_in_8bit: true\nadapter: lora\n\n# Training settings\nmicro_batch_size: 2\nnum_epochs: 3\nlearning_rate: 0.0003\n\n# Your dataset\ndatasets:\n  - path: my_data.jsonl        # Your local data file\n    type: alpaca               # Or other format\nThis specific config is for LoRA fine-tuning a model with instruction tuning data using\nthe alpaca dataset format, which has the following format:\n{\n    \"instruction\": \"Write a description of alpacas.\",\n    \"input\": \"\",\n    \"output\": \"Alpacas are domesticated South American camelids...\"\n}\nPlease see our Dataset Formats for more dataset formats and how to\nformat them.\n\nPrepare your JSONL data in the specified format (in this case, the expected alpaca\nformat):\n\n{\"instruction\": \"Classify this text\", \"input\": \"I love this!\", \"output\": \"positive\"}\n{\"instruction\": \"Classify this text\", \"input\": \"Not good at all\", \"output\": \"negative\"}\n\nRun the training:\n\naxolotl train my_training.yml",
+    "objectID": "docs/multimodal.html#faq",
+    "href": "docs/multimodal.html#faq",
+    "title": "MultiModal / Vision Language Models (BETA)",
+    "section": "FAQ",
+    "text": "FAQ\n\nPIL.UnidentifiedImageError: cannot identify image file ...\n\nPIL could not retrieve the file at url using requests. Please check for typo. One alternative reason is that the request is blocked by the server.",
     "crumbs": [
-      "Getting Started",
-      "Quickstart"
+      "How To Guides",
+      "MultiModal / Vision Language Models (BETA)"
     ]
   },
   {
-    "objectID": "docs/getting-started.html#sec-common-tasks",
-    "href": "docs/getting-started.html#sec-common-tasks",
-    "title": "Quickstart",
-    "section": "4 Common Tasks",
-    "text": "4 Common Tasks\n\n\n\n\n\n\nTip\n\n\n\nThe same yaml file is used for training, inference, and merging.\n\n\n\n4.1 Testing Your Model\nAfter training, test your model:\naxolotl inference my_training.yml --lora-model-dir=\"./outputs/lora-out\"\nMore details can be found in Inference.\n\n\n4.2 Using a UI\nLaunch a Gradio interface:\naxolotl inference my_training.yml --lora-model-dir=\"./outputs/lora-out\" --gradio\n\n\n4.3 Preprocessing Data\nFor large datasets, preprocess first:\naxolotl preprocess my_training.yml\nPlease make sure to set dataset_prepared_path: in your config to set the path to save the prepared dataset.\nMore details can be found in Dataset Preprocessing.\n\n\n4.4 Merging LoRA weights\nTo merge the LoRA weights back into the base model, run:\naxolotl merge-lora my_training.yml --lora-model-dir=\"./outputs/lora-out\"\nThe merged model will be saved in the {output_dir}/merged directory.\nMore details can be found in Merging LoRA weights.",
-    "crumbs": [
-      "Getting Started",
-      "Quickstart"
-    ]
-  },
-  {
-    "objectID": "docs/getting-started.html#sec-next-steps",
-    "href": "docs/getting-started.html#sec-next-steps",
-    "title": "Quickstart",
-    "section": "5 Next Steps",
-    "text": "5 Next Steps\nNow that you have the basics, explore these guides based on what you want to do:\nChoose your path:\n\nChoosing a Fine-Tuning Method — SFT vs LoRA vs QLoRA vs GRPO vs DPO, with hardware recommendations\n\nCore guides:\n\nDataset Loading — Loading datasets from various sources\nDataset Formats — Working with different data formats\nOptimizations — Flash attention, gradient checkpointing, sample packing\nTraining Stability & Debugging — Monitoring metrics, fixing NaN, OOM debugging\n\nAdvanced training methods:\n\nRLHF / Preference Learning — DPO, KTO, GRPO, EBFT\nGRPO Training — RL with custom rewards and vLLM generation\nvLLM Serving — Setting up vLLM for GRPO\n\nScaling up:\n\nMulti-GPU Training — DeepSpeed, FSDP, DDP\nMulti-Node Training — Distributed training across machines",
-    "crumbs": [
-      "Getting Started",
-      "Quickstart"
-    ]
-  },
-  {
-    "objectID": "docs/api/utils.schemas.enums.html",
-    "href": "docs/api/utils.schemas.enums.html",
-    "title": "utils.schemas.enums",
-    "section": "",
-    "text": "utils.schemas.enums\nEnums for Axolotl input config\n\n\n\n\n\nName\nDescription\n\n\n\n\nChatTemplate\nChat templates configuration subset\n\n\nCustomSupportedOptimizers\nCustom supported optimizers\n\n\nRLType\nRL trainer type configuration subset\n\n\nRingAttnFunc\nEnum class for supported ring-flash-attn implementations\n\n\n\n\n\nutils.schemas.enums.ChatTemplate()\nChat templates configuration subset\n\n\n\nutils.schemas.enums.CustomSupportedOptimizers()\nCustom supported optimizers\n\n\n\nutils.schemas.enums.RLType()\nRL trainer type configuration subset\n\n\n\nutils.schemas.enums.RingAttnFunc()\nEnum class for supported ring-flash-attn implementations"
-  },
-  {
-    "objectID": "docs/api/utils.schemas.enums.html#classes",
-    "href": "docs/api/utils.schemas.enums.html#classes",
-    "title": "utils.schemas.enums",
-    "section": "",
-    "text": "Name\nDescription\n\n\n\n\nChatTemplate\nChat templates configuration subset\n\n\nCustomSupportedOptimizers\nCustom supported optimizers\n\n\nRLType\nRL trainer type configuration subset\n\n\nRingAttnFunc\nEnum class for supported ring-flash-attn implementations\n\n\n\n\n\nutils.schemas.enums.ChatTemplate()\nChat templates configuration subset\n\n\n\nutils.schemas.enums.CustomSupportedOptimizers()\nCustom supported optimizers\n\n\n\nutils.schemas.enums.RLType()\nRL trainer type configuration subset\n\n\n\nutils.schemas.enums.RingAttnFunc()\nEnum class for supported ring-flash-attn implementations"
-  },
-  {
-    "objectID": "docs/api/core.builders.base.html",
-    "href": "docs/api/core.builders.base.html",
-    "title": "core.builders.base",
-    "section": "",
-    "text": "core.builders.base\nBase class for trainer builder\n\n\n\n\n\nName\nDescription\n\n\n\n\nTrainerBuilderBase\nBase class for trainer builder.\n\n\n\n\n\ncore.builders.base.TrainerBuilderBase(cfg, model, tokenizer, processor=None)\nBase class for trainer builder.\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_post_trainer_create_callbacks\nCallbacks added after the trainer is created, usually b/c these need access to the trainer\n\n\n\n\n\ncore.builders.base.TrainerBuilderBase.get_post_trainer_create_callbacks(trainer)\nCallbacks added after the trainer is created, usually b/c these need access to the trainer"
-  },
-  {
-    "objectID": "docs/api/core.builders.base.html#classes",
-    "href": "docs/api/core.builders.base.html#classes",
-    "title": "core.builders.base",
-    "section": "",
-    "text": "Name\nDescription\n\n\n\n\nTrainerBuilderBase\nBase class for trainer builder.\n\n\n\n\n\ncore.builders.base.TrainerBuilderBase(cfg, model, tokenizer, processor=None)\nBase class for trainer builder.\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_post_trainer_create_callbacks\nCallbacks added after the trainer is created, usually b/c these need access to the trainer\n\n\n\n\n\ncore.builders.base.TrainerBuilderBase.get_post_trainer_create_callbacks(trainer)\nCallbacks added after the trainer is created, usually b/c these need access to the trainer"
-  },
-  {
-    "objectID": "docs/api/logging_config.html",
-    "href": "docs/api/logging_config.html",
-    "title": "logging_config",
-    "section": "",
-    "text": "logging_config\nCommon logging module for axolotl.\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlLogger\nLogger that applies filtering to non-axolotl loggers.\n\n\nAxolotlOrWarnErrorFilter\nAllows ANY WARNING or higher (unless overridden by LOG_LEVEL). Allows axolotl.* at\n\n\nColorfulFormatter\nFormatter to add coloring to log messages by log type\n\n\n\n\n\nlogging_config.AxolotlLogger(name, level=logging.NOTSET)\nLogger that applies filtering to non-axolotl loggers.\n\n\n\nlogging_config.AxolotlOrWarnErrorFilter(**kwargs)\nAllows ANY WARNING or higher (unless overridden by LOG_LEVEL). Allows axolotl.* at\nINFO or higher (unless overridden by AXOLOTL_LOG_LEVEL). Drops all other records\n(i.e. non-axolotl.INFO, DEBUG, etc. by default).\n\n\n\nlogging_config.ColorfulFormatter()\nFormatter to add coloring to log messages by log type\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nconfigure_logging\nConfigure with default logging\n\n\n\n\n\nlogging_config.configure_logging()\nConfigure with default logging"
-  },
-  {
-    "objectID": "docs/api/logging_config.html#classes",
-    "href": "docs/api/logging_config.html#classes",
-    "title": "logging_config",
-    "section": "",
-    "text": "Name\nDescription\n\n\n\n\nAxolotlLogger\nLogger that applies filtering to non-axolotl loggers.\n\n\nAxolotlOrWarnErrorFilter\nAllows ANY WARNING or higher (unless overridden by LOG_LEVEL). Allows axolotl.* at\n\n\nColorfulFormatter\nFormatter to add coloring to log messages by log type\n\n\n\n\n\nlogging_config.AxolotlLogger(name, level=logging.NOTSET)\nLogger that applies filtering to non-axolotl loggers.\n\n\n\nlogging_config.AxolotlOrWarnErrorFilter(**kwargs)\nAllows ANY WARNING or higher (unless overridden by LOG_LEVEL). Allows axolotl.* at\nINFO or higher (unless overridden by AXOLOTL_LOG_LEVEL). Drops all other records\n(i.e. non-axolotl.INFO, DEBUG, etc. by default).\n\n\n\nlogging_config.ColorfulFormatter()\nFormatter to add coloring to log messages by log type"
-  },
-  {
-    "objectID": "docs/api/logging_config.html#functions",
-    "href": "docs/api/logging_config.html#functions",
-    "title": "logging_config",
+    "objectID": "docs/api/utils.schemas.multimodal.html",
+    "href": "docs/api/utils.schemas.multimodal.html",
+    "title": "utils.schemas.multimodal",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nconfigure_logging\nConfigure with default logging\n\n\n\n\n\nlogging_config.configure_logging()\nConfigure with default logging"
+    "text": "utils.schemas.multimodal\nPydantic models for multimodal-related configuration\n\n\n\n\n\nName\nDescription\n\n\n\n\nMultiModalConfig\nMulti-modal configuration subset\n\n\n\n\n\nutils.schemas.multimodal.MultiModalConfig()\nMulti-modal configuration subset\n\n\n\n\n\nName\nDescription\n\n\n\n\nconvert_image_resize_algorithm\nConvert the image resize algorithm to a PIL.Image.Resampling enum.\n\n\n\n\n\nutils.schemas.multimodal.MultiModalConfig.convert_image_resize_algorithm(\n    image_resize_algorithm,\n)\nConvert the image resize algorithm to a PIL.Image.Resampling enum."
   },
   {
-    "objectID": "docs/api/prompt_strategies.alpaca_instruct.html",
-    "href": "docs/api/prompt_strategies.alpaca_instruct.html",
-    "title": "prompt_strategies.alpaca_instruct",
+    "objectID": "docs/api/utils.schemas.multimodal.html#classes",
+    "href": "docs/api/utils.schemas.multimodal.html#classes",
+    "title": "utils.schemas.multimodal",
     "section": "",
-    "text": "prompt_strategies.alpaca_instruct\nprompt_strategies.alpaca_instruct\nModule loading the AlpacaInstructPromptTokenizingStrategy class"
+    "text": "Name\nDescription\n\n\n\n\nMultiModalConfig\nMulti-modal configuration subset\n\n\n\n\n\nutils.schemas.multimodal.MultiModalConfig()\nMulti-modal configuration subset\n\n\n\n\n\nName\nDescription\n\n\n\n\nconvert_image_resize_algorithm\nConvert the image resize algorithm to a PIL.Image.Resampling enum.\n\n\n\n\n\nutils.schemas.multimodal.MultiModalConfig.convert_image_resize_algorithm(\n    image_resize_algorithm,\n)\nConvert the image resize algorithm to a PIL.Image.Resampling enum."
   },
   {
-    "objectID": "docs/api/utils.callbacks.mlflow_.html",
-    "href": "docs/api/utils.callbacks.mlflow_.html",
-    "title": "utils.callbacks.mlflow_",
+    "objectID": "docs/api/core.chat.format.shared.html",
+    "href": "docs/api/core.chat.format.shared.html",
+    "title": "core.chat.format.shared",
     "section": "",
-    "text": "utils.callbacks.mlflow_\nMLFlow module for trainer callbacks\n\n\n\n\n\nName\nDescription\n\n\n\n\nSaveAxolotlConfigtoMlflowCallback\nCallback to save axolotl config to mlflow\n\n\n\n\n\nutils.callbacks.mlflow_.SaveAxolotlConfigtoMlflowCallback(axolotl_config_path)\nCallback to save axolotl config to mlflow"
+    "text": "core.chat.format.shared\ncore.chat.format.shared\nshared functions for format transforms"
   },
   {
-    "objectID": "docs/api/utils.callbacks.mlflow_.html#classes",
-    "href": "docs/api/utils.callbacks.mlflow_.html#classes",
-    "title": "utils.callbacks.mlflow_",
+    "objectID": "docs/api/utils.samplers.multipack.html",
+    "href": "docs/api/utils.samplers.multipack.html",
+    "title": "utils.samplers.multipack",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nSaveAxolotlConfigtoMlflowCallback\nCallback to save axolotl config to mlflow\n\n\n\n\n\nutils.callbacks.mlflow_.SaveAxolotlConfigtoMlflowCallback(axolotl_config_path)\nCallback to save axolotl config to mlflow"
+    "text": "utils.samplers.multipack\nMultipack Batch Sampler - An efficient batch sampler for packing variable-length sequences\ninto fixed-capacity batches to optimize memory usage and training throughput.\n\n\n\n\n\nName\nDescription\n\n\n\n\nMultipackBatchSampler\nBatch sampler class for efficient packing of variable-length sequences\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler(\n    sampler,\n    batch_size,\n    batch_max_len,\n    lengths,\n    bin_size,\n    packing_efficiency_estimate=1.0,\n    drop_last=True,\n    num_count_samples=4,\n    sequential=False,\n    group_size=100000,\n    num_processes=None,\n    safe_mode=True,\n    mp_start_method='fork',\n    **kwargs,\n)\nBatch sampler class for efficient packing of variable-length sequences\nThis sampler packs sequences into fixed-capacity bins (batches) to maximize\nGPU memory utilization and training throughput by reducing padding.\nIt supports both parallel packing (using FFD algorithm) and\nsequential packing (preserving original sequence order).\n\n\n\n\n\nName\nDescription\n\n\n\n\nefficiency\nCalculate the packing efficiency (ratio of tokens used to total token slots).\n\n\ngather_efficiency\nGather and synchronize packing efficiency estimates across all distributed\n\n\ngather_len_batches\nGather and synchronize batch counts across all distributed ranks. 
Returns\n\n\ngenerate_batches\nGenerate packed batches for training.\n\n\nset_epoch\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.efficiency()\nCalculate the packing efficiency (ratio of tokens used to total token slots).\nHigher is better - 1.0 would mean perfect packing with no wasted space.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_efficiency()\nGather and synchronize packing efficiency estimates across all distributed\nranks.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nfloat\nA conservative efficiency estimate based on the measurements.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_len_batches(num)\nGather and synchronize batch counts across all distributed ranks. Returns\nthe minimum number of batches available on any rank.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.generate_batches(set_stats=False)\nGenerate packed batches for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nset_stats\nbool\nWhether to update efficiency statistics.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[list[int]]]\nList of batches, where each batch contains multiple bins, and each bin contains multiple sequence indices.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.set_epoch(epoch)\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nallocate_sequentially\nSequential allocator that preserves example order.\n\n\nffd_check\nFirst-fit-decreasing bin packing algorithm check.\n\n\npack_group\nPack a group of sequences into bins using First-Fit Decreasing algorithm.\n\n\npack_parallel\nPack sequences into bins using parallel processing.\n\n\n\n\n\nutils.samplers.multipack.allocate_sequentially(\n    sequence_lengths,\n    rank,\n    bin_capacity,\n    num_ranks,\n)\nSequential allocator that preserves example 
order.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nThe lengths of all examples.\nrequired\n\n\nrank\nint\nThe current rank (for distributed training).\nrequired\n\n\nbin_capacity\nint\nThe capacity of each bin (maximum sequence length).\nrequired\n\n\nnum_ranks\nint\nNumber of ranks (processes / GPUs).\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nrank_batches\nlist[list[int]]\nList of batches for the current rank.\n\n\ntotal_tokens_used\nint\nNumber of actual example tokens.\n\n\ntotal_token_slots\nint\nMaximum theoretical number of example tokens (number of bins * bin capacity).\n\n\n\n\n\n\n\nutils.samplers.multipack.ffd_check(sequence_lengths, bin_capacity, num_bins)\nFirst-fit-decreasing bin packing algorithm check.\nChecks if sequences with the given lengths could fit in the specified number of\nbins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin.\nrequired\n\n\nnum_bins\nint\nNumber of bins available.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nTrue if all sequences can be packed, False otherwise.\n\n\n\n\n\n\n\nutils.samplers.multipack.pack_group(\n    sequence_lengths,\n    group_offset,\n    bin_capacity,\n    max_bins,\n    bin_size,\n    safe_mode=True,\n)\nPack a group of sequences into bins using First-Fit Decreasing algorithm.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\ngroup_offset\nint\nOffset to apply to indices when returning results.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin.\nrequired\n\n\nmax_bins\nint\nMaximum number of bins to use.\nrequired\n\n\nbin_size\nint\nMaximum number of sequences per bin.\nrequired\n\n\nsafe_mode\nbool\nIf True, use a more conservative packing 
approach.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[int]]\nList of bins, where each bin contains indices of sequences assigned to it.\n\n\n\n\n\n\n\nutils.samplers.multipack.pack_parallel(\n    sequence_lengths,\n    bin_capacity,\n    group_size,\n    bin_size,\n    num_processes=None,\n    safe_mode=True,\n    mp_start_method='fork',\n)\nPack sequences into bins using parallel processing.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin as total number of tokens.\nrequired\n\n\ngroup_size\nint\nNumber of sequences to process in each group.\nrequired\n\n\nbin_size\nint\nMaximum number of bins to use.\nrequired\n\n\nnum_processes\nint | None\nNumber of parallel processes to use.\nNone\n\n\nsafe_mode\nbool\nIf True, use a more conservative packing approach.\nTrue\n\n\nmp_start_method\nstr | None\nMultiprocessing start method (‘fork’, ‘spawn’, ‘forkserver’). ‘spawn’ is often safer with Numba/PyTorch. Set to None to use system default.\n'fork'\n\n\n\nReturns:\nList of bins, where each bin contains indices of sequences assigned to it."
   },
   {
-    "objectID": "docs/api/cli.utils.html",
-    "href": "docs/api/cli.utils.html",
-    "title": "cli.utils",
+    "objectID": "docs/api/utils.samplers.multipack.html#classes",
+    "href": "docs/api/utils.samplers.multipack.html#classes",
+    "title": "utils.samplers.multipack",
     "section": "",
-    "text": "cli.utils\ncli.utils\nInit for axolotl.cli.utils module."
+    "text": "Name\nDescription\n\n\n\n\nMultipackBatchSampler\nBatch sampler class for efficient packing of variable-length sequences\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler(\n    sampler,\n    batch_size,\n    batch_max_len,\n    lengths,\n    bin_size,\n    packing_efficiency_estimate=1.0,\n    drop_last=True,\n    num_count_samples=4,\n    sequential=False,\n    group_size=100000,\n    num_processes=None,\n    safe_mode=True,\n    mp_start_method='fork',\n    **kwargs,\n)\nBatch sampler class for efficient packing of variable-length sequences\nThis sampler packs sequences into fixed-capacity bins (batches) to maximize\nGPU memory utilization and training throughput by reducing padding.\nIt supports both parallel packing (using FFD algorithm) and\nsequential packing (preserving original sequence order).\n\n\n\n\n\nName\nDescription\n\n\n\n\nefficiency\nCalculate the packing efficiency (ratio of tokens used to total token slots).\n\n\ngather_efficiency\nGather and synchronize packing efficiency estimates across all distributed\n\n\ngather_len_batches\nGather and synchronize batch counts across all distributed ranks. Returns\n\n\ngenerate_batches\nGenerate packed batches for training.\n\n\nset_epoch\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.efficiency()\nCalculate the packing efficiency (ratio of tokens used to total token slots).\nHigher is better - 1.0 would mean perfect packing with no wasted space.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_efficiency()\nGather and synchronize packing efficiency estimates across all distributed\nranks.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nfloat\nA conservative efficiency estimate based on the measurements.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_len_batches(num)\nGather and synchronize batch counts across all distributed ranks. 
Returns\nthe minimum number of batches available on any rank.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.generate_batches(set_stats=False)\nGenerate packed batches for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nset_stats\nbool\nWhether to update efficiency statistics.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[list[int]]]\nList of batches, where each batch contains multiple bins, and each bin contains multiple sequence indices.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.set_epoch(epoch)\nSet the epoch number, used for reproducible shuffling across epochs"
   },
   {
-    "objectID": "docs/api/loaders.patch_manager.html",
-    "href": "docs/api/loaders.patch_manager.html",
-    "title": "loaders.patch_manager",
+    "objectID": "docs/api/utils.samplers.multipack.html#functions",
+    "href": "docs/api/utils.samplers.multipack.html#functions",
+    "title": "utils.samplers.multipack",
     "section": "",
-    "text": "loaders.patch_manager\nPatch manager class implementation to complement axolotl.loaders.ModelLoader.\nApplies pre- and post-model load patches for various fixes and optimizations.\n\n\n\n\n\nName\nDescription\n\n\n\n\nPatchManager\nManages the application of patches during the model loading process.\n\n\n\n\n\nloaders.patch_manager.PatchManager(cfg, model_config, inference=False)\nManages the application of patches during the model loading process.\n\n\n\n\n\nName\nDescription\n\n\n\n\nhas_flash_attn\nCheck if flash attention is installed.\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\napply_post_model_build_patches\nApply patches right after model build, before post-load setup.\n\n\napply_post_model_load_patches\nApply patches that require the model instance.\n\n\napply_post_plugin_pre_model_load_patches\nApply post plugin-pre_model_load load patches based on config.\n\n\napply_pre_config_load_patches\nApply patches that must be set up before config loading.\n\n\napply_pre_model_load_patches\nApply pre-model load patches based on config.\n\n\napply_pre_tokenizer_load_patches\nApply patches that must be set up before tokenizer loading.\n\n\n\n\n\nloaders.patch_manager.PatchManager.apply_post_model_build_patches(model)\nApply patches right after model build, before post-load setup.\n\n\n\nloaders.patch_manager.PatchManager.apply_post_model_load_patches(model)\nApply patches that require the model instance.\n\n\n\nloaders.patch_manager.PatchManager.apply_post_plugin_pre_model_load_patches()\nApply post plugin-pre_model_load load patches based on config.\n\n\n\nloaders.patch_manager.PatchManager.apply_pre_config_load_patches(cfg)\nApply patches that must be set up before config loading.\nThis is for patches that intercept remote code loading from HuggingFace,\nwhich needs to be in place before AutoConfig.from_pretrained() is called.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nConfiguration dictionary with model and 
training settings.\nrequired\n\n\n\n\n\n\n\nloaders.patch_manager.PatchManager.apply_pre_model_load_patches()\nApply pre-model load patches based on config.\n\n\n\nloaders.patch_manager.PatchManager.apply_pre_tokenizer_load_patches(cfg)\nApply patches that must be set up before tokenizer loading.\nThis is for patches that intercept remote code loading from HuggingFace,\nwhich needs to be in place before AutoTokenizer.from_pretrained() is called.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nConfiguration dictionary with model and training settings.\nrequired"
+    "text": "Name\nDescription\n\n\n\n\nallocate_sequentially\nSequential allocator that preserves example order.\n\n\nffd_check\nFirst-fit-decreasing bin packing algorithm check.\n\n\npack_group\nPack a group of sequences into bins using First-Fit Decreasing algorithm.\n\n\npack_parallel\nPack sequences into bins using parallel processing.\n\n\n\n\n\nutils.samplers.multipack.allocate_sequentially(\n    sequence_lengths,\n    rank,\n    bin_capacity,\n    num_ranks,\n)\nSequential allocator that preserves example order.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nThe lengths of all examples.\nrequired\n\n\nrank\nint\nThe current rank (for distributed training).\nrequired\n\n\nbin_capacity\nint\nThe capacity of each bin (maximum sequence length).\nrequired\n\n\nnum_ranks\nint\nNumber of ranks (processes / GPUs).\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nrank_batches\nlist[list[int]]\nList of batches for the current rank.\n\n\ntotal_tokens_used\nint\nNumber of actual example tokens.\n\n\ntotal_token_slots\nint\nMaximum theoretical number of example tokens (number of bins * bin capacity).\n\n\n\n\n\n\n\nutils.samplers.multipack.ffd_check(sequence_lengths, bin_capacity, num_bins)\nFirst-fit-decreasing bin packing algorithm check.\nChecks if sequences with the given lengths could fit in the specified number of\nbins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin.\nrequired\n\n\nnum_bins\nint\nNumber of bins available.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nTrue if all sequences can be packed, False otherwise.\n\n\n\n\n\n\n\nutils.samplers.multipack.pack_group(\n    sequence_lengths,\n    group_offset,\n    bin_capacity,\n    max_bins,\n    bin_size,\n    safe_mode=True,\n)\nPack a group of sequences into bins 
using First-Fit Decreasing algorithm.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\ngroup_offset\nint\nOffset to apply to indices when returning results.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin.\nrequired\n\n\nmax_bins\nint\nMaximum number of bins to use.\nrequired\n\n\nbin_size\nint\nMaximum number of sequences per bin.\nrequired\n\n\nsafe_mode\nbool\nIf True, use a more conservative packing approach.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[int]]\nList of bins, where each bin contains indices of sequences assigned to it.\n\n\n\n\n\n\n\nutils.samplers.multipack.pack_parallel(\n    sequence_lengths,\n    bin_capacity,\n    group_size,\n    bin_size,\n    num_processes=None,\n    safe_mode=True,\n    mp_start_method='fork',\n)\nPack sequences into bins using parallel processing.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin as total number of tokens.\nrequired\n\n\ngroup_size\nint\nNumber of sequences to process in each group.\nrequired\n\n\nbin_size\nint\nMaximum number of bins to use.\nrequired\n\n\nnum_processes\nint | None\nNumber of parallel processes to use.\nNone\n\n\nsafe_mode\nbool\nIf True, use a more conservative packing approach.\nTrue\n\n\nmp_start_method\nstr | None\nMultiprocessing start method (‘fork’, ‘spawn’, ‘forkserver’). ‘spawn’ is often safer with Numba/PyTorch. Set to None to use system default.\n'fork'\n\n\n\nReturns:\nList of bins, where each bin contains indices of sequences assigned to it."
   },
   {
-    "objectID": "docs/api/loaders.patch_manager.html#classes",
-    "href": "docs/api/loaders.patch_manager.html#classes",
-    "title": "loaders.patch_manager",
+    "objectID": "docs/api/utils.schedulers.html",
+    "href": "docs/api/utils.schedulers.html",
+    "title": "utils.schedulers",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nPatchManager\nManages the application of patches during the model loading process.\n\n\n\n\n\nloaders.patch_manager.PatchManager(cfg, model_config, inference=False)\nManages the application of patches during the model loading process.\n\n\n\n\n\nName\nDescription\n\n\n\n\nhas_flash_attn\nCheck if flash attention is installed.\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\napply_post_model_build_patches\nApply patches right after model build, before post-load setup.\n\n\napply_post_model_load_patches\nApply patches that require the model instance.\n\n\napply_post_plugin_pre_model_load_patches\nApply post plugin-pre_model_load load patches based on config.\n\n\napply_pre_config_load_patches\nApply patches that must be set up before config loading.\n\n\napply_pre_model_load_patches\nApply pre-model load patches based on config.\n\n\napply_pre_tokenizer_load_patches\nApply patches that must be set up before tokenizer loading.\n\n\n\n\n\nloaders.patch_manager.PatchManager.apply_post_model_build_patches(model)\nApply patches right after model build, before post-load setup.\n\n\n\nloaders.patch_manager.PatchManager.apply_post_model_load_patches(model)\nApply patches that require the model instance.\n\n\n\nloaders.patch_manager.PatchManager.apply_post_plugin_pre_model_load_patches()\nApply post plugin-pre_model_load load patches based on config.\n\n\n\nloaders.patch_manager.PatchManager.apply_pre_config_load_patches(cfg)\nApply patches that must be set up before config loading.\nThis is for patches that intercept remote code loading from HuggingFace,\nwhich needs to be in place before AutoConfig.from_pretrained() is called.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nConfiguration dictionary with model and training settings.\nrequired\n\n\n\n\n\n\n\nloaders.patch_manager.PatchManager.apply_pre_model_load_patches()\nApply pre-model load patches based on 
config.\n\n\n\nloaders.patch_manager.PatchManager.apply_pre_tokenizer_load_patches(cfg)\nApply patches that must be set up before tokenizer loading.\nThis is for patches that intercept remote code loading from HuggingFace,\nwhich needs to be in place before AutoTokenizer.from_pretrained() is called.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nConfiguration dictionary with model and training settings.\nrequired"
+    "text": "utils.schedulers\nModule for custom LRScheduler class\n\n\n\n\n\nName\nDescription\n\n\n\n\nInterpolatingLogScheduler\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\nJaggedLRRestartScheduler\nWraps another scheduler to apply per-lora-restart learning rate warmups.\n\n\nRexLR\nReflected Exponential (REX) learning rate scheduler.\n\n\n\n\n\nutils.schedulers.InterpolatingLogScheduler(\n    optimizer,\n    num_steps,\n    min_lr,\n    max_lr,\n    last_epoch=-1,\n)\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\n\nutils.schedulers.JaggedLRRestartScheduler(\n    optimizer,\n    inner_schedule,\n    jagged_restart_steps,\n    jagged_restart_warmup_steps,\n    jagged_restart_anneal_steps=1,\n    min_lr_scale=0.001,\n)\nWraps another scheduler to apply per-lora-restart learning rate warmups.\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_state_dict\nRestore state, including inner_schedule.\n\n\nstate_dict\nReturn serializable state, saving inner_schedule as its own state_dict.\n\n\n\n\n\nutils.schedulers.JaggedLRRestartScheduler.load_state_dict(state_dict)\nRestore state, including inner_schedule.\n\n\n\nutils.schedulers.JaggedLRRestartScheduler.state_dict()\nReturn serializable state, saving inner_schedule as its own state_dict.\n\n\n\n\n\nutils.schedulers.RexLR(\n    optimizer,\n    max_lr,\n    min_lr,\n    total_steps=0,\n    num_warmup_steps=0,\n    last_step=0,\n)\nReflected Exponential (REX) learning rate scheduler.\n\nOriginal implementation: https://github.com/IvanVassi/REX_LR\nOriginal license: Apache 2.0\nBased on: https://arxiv.org/abs/2107.04197\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\ntorch.optim.Optimizer\nThe optimizer to schedule the learning rate for.\nrequired\n\n\nmax_lr\nfloat\nThe maximum learning rate.\nrequired\n\n\nmin_lr\nfloat\nThe minimum learning rate.\nrequired\n\n\ntotal_steps\nint\nThe total number of training 
steps.\n0\n\n\nnum_warmup_steps\nint\nThe number of warmup steps.\n0\n\n\nlast_step\nint\nThe index of last step.\n0\n\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_cosine_schedule_with_min_lr\n\n\n\nget_cosine_schedule_with_quadratic_warmup\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\n\n\nget_cosine_schedule_with_warmup_decay_constant\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf)\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_min_lr(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    min_lr_ratio=0.0,\n)\n\n\n\nlinear warmup from 0 -&gt; max_lr over num_warmup_steps\ncosine learning rate annealing from max_lr -&gt; min_lr over num_training_steps\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_quadratic_warmup(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    num_cycles=0.5,\n    last_epoch=-1,\n)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the\ninitial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nnum_cycles\nfloat, optional, defaults to 0.5\nThe number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine).\n0.5\n\n\nlast_epoch\nint, optional, defaults to -1\nThe index of the last epoch when resuming training.\n-1\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate 
schedule.\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_warmup_decay_constant(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    constant_lr_ratio,\n    min_lr_ratio,\n    num_cycles=0.5,\n    last_epoch=-1,\n)\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to min_lr_ratio until num_training_steps * constant_lr_ratio, after constant_rate returns constant value of min_rate\n, after a warmup period during which it increases linearly between 0 and the initial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nconstant_lr_ratio\nfloat\n(float): The ratio of num_training_steps to decrease by cosine function.\nrequired\n\n\nmin_lr_ratio\nfloat\n(float): The ratio of maximum learning rate for cosine function to decay to minimum learning rate.                            | _required_ | | num_cycles         |float, *optional*, defaults to 0.5 | The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine). |0.5| | last_epoch         |int, *optional*, defaults to -1    | The index of the last epoch when resuming training.                                                                            |-1`\n\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule."
   },
   {
-    "objectID": "docs/api/core.datasets.chat.html",
-    "href": "docs/api/core.datasets.chat.html",
-    "title": "core.datasets.chat",
+    "objectID": "docs/api/utils.schedulers.html#classes",
+    "href": "docs/api/utils.schedulers.html#classes",
+    "title": "utils.schedulers",
     "section": "",
-    "text": "core.datasets.chat\nchat dataset module\n\n\n\n\n\nName\nDescription\n\n\n\n\nTokenizedChatDataset\nTokenized chat dataset\n\n\n\n\n\ncore.datasets.chat.TokenizedChatDataset(\n    data,\n    model_transform,\n    *args,\n    message_transform=None,\n    formatter=None,\n    process_count=None,\n    keep_in_memory=False,\n    **kwargs,\n)\nTokenized chat dataset"
+    "text": "Name\nDescription\n\n\n\n\nInterpolatingLogScheduler\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\nJaggedLRRestartScheduler\nWraps another scheduler to apply per-lora-restart learning rate warmups.\n\n\nRexLR\nReflected Exponential (REX) learning rate scheduler.\n\n\n\n\n\nutils.schedulers.InterpolatingLogScheduler(\n    optimizer,\n    num_steps,\n    min_lr,\n    max_lr,\n    last_epoch=-1,\n)\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\n\nutils.schedulers.JaggedLRRestartScheduler(\n    optimizer,\n    inner_schedule,\n    jagged_restart_steps,\n    jagged_restart_warmup_steps,\n    jagged_restart_anneal_steps=1,\n    min_lr_scale=0.001,\n)\nWraps another scheduler to apply per-lora-restart learning rate warmups.\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_state_dict\nRestore state, including inner_schedule.\n\n\nstate_dict\nReturn serializable state, saving inner_schedule as its own state_dict.\n\n\n\n\n\nutils.schedulers.JaggedLRRestartScheduler.load_state_dict(state_dict)\nRestore state, including inner_schedule.\n\n\n\nutils.schedulers.JaggedLRRestartScheduler.state_dict()\nReturn serializable state, saving inner_schedule as its own state_dict.\n\n\n\n\n\nutils.schedulers.RexLR(\n    optimizer,\n    max_lr,\n    min_lr,\n    total_steps=0,\n    num_warmup_steps=0,\n    last_step=0,\n)\nReflected Exponential (REX) learning rate scheduler.\n\nOriginal implementation: https://github.com/IvanVassi/REX_LR\nOriginal license: Apache 2.0\nBased on: https://arxiv.org/abs/2107.04197\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\ntorch.optim.Optimizer\nThe optimizer to schedule the learning rate for.\nrequired\n\n\nmax_lr\nfloat\nThe maximum learning rate.\nrequired\n\n\nmin_lr\nfloat\nThe minimum learning rate.\nrequired\n\n\ntotal_steps\nint\nThe total number of training steps.\n0\n\n\nnum_warmup_steps\nint\nThe number of warmup steps.\n0\n\n\nlast_step\nint\nThe 
index of last step.\n0"
   },
   {
-    "objectID": "docs/api/core.datasets.chat.html#classes",
-    "href": "docs/api/core.datasets.chat.html#classes",
-    "title": "core.datasets.chat",
+    "objectID": "docs/api/utils.schedulers.html#functions",
+    "href": "docs/api/utils.schedulers.html#functions",
+    "title": "utils.schedulers",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nTokenizedChatDataset\nTokenized chat dataset\n\n\n\n\n\ncore.datasets.chat.TokenizedChatDataset(\n    data,\n    model_transform,\n    *args,\n    message_transform=None,\n    formatter=None,\n    process_count=None,\n    keep_in_memory=False,\n    **kwargs,\n)\nTokenized chat dataset"
+    "text": "Name\nDescription\n\n\n\n\nget_cosine_schedule_with_min_lr\n\n\n\nget_cosine_schedule_with_quadratic_warmup\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\n\n\nget_cosine_schedule_with_warmup_decay_constant\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf)\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_min_lr(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    min_lr_ratio=0.0,\n)\n\n\n\nlinear warmup from 0 -&gt; max_lr over num_warmup_steps\ncosine learning rate annealing from max_lr -&gt; min_lr over num_training_steps\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_quadratic_warmup(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    num_cycles=0.5,\n    last_epoch=-1,\n)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the\ninitial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nnum_cycles\nfloat, optional, defaults to 0.5\nThe number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine).\n0.5\n\n\nlast_epoch\nint, optional, defaults to -1\nThe index of the last epoch when resuming training.\n-1\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule.\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_warmup_decay_constant(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    
constant_lr_ratio,\n    min_lr_ratio,\n    num_cycles=0.5,\n    last_epoch=-1,\n)\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to min_lr_ratio until num_training_steps * constant_lr_ratio, after constant_rate returns constant value of min_rate\n, after a warmup period during which it increases linearly between 0 and the initial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nconstant_lr_ratio\nfloat\n(float): The ratio of num_training_steps to decrease by cosine function.\nrequired\n\n\nmin_lr_ratio\nfloat\n(float): The ratio of maximum learning rate for cosine function to decay to minimum learning rate.                            | _required_ | | num_cycles         |float, *optional*, defaults to 0.5 | The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine). |0.5| | last_epoch         |int, *optional*, defaults to -1    | The index of the last epoch when resuming training.                                                                            |-1`\n\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule."
   },
   {
-    "objectID": "docs/api/core.trainers.mixins.scheduler.html",
-    "href": "docs/api/core.trainers.mixins.scheduler.html",
-    "title": "core.trainers.mixins.scheduler",
+    "objectID": "docs/api/cli.main.html",
+    "href": "docs/api/cli.main.html",
+    "title": "cli.main",
     "section": "",
-    "text": "core.trainers.mixins.scheduler\nModule for Axolotl trainer scheduler mixin\n\n\n\n\n\nName\nDescription\n\n\n\n\nSchedulerMixin\nMixin class for scheduler setup in CausalTrainer.\n\n\n\n\n\ncore.trainers.mixins.scheduler.SchedulerMixin()\nMixin class for scheduler setup in CausalTrainer.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncreate_scheduler\nSet up the scheduler. The optimizer of the trainer must have been set up either before this method is called or\n\n\n\n\n\ncore.trainers.mixins.scheduler.SchedulerMixin.create_scheduler(\n    num_training_steps,\n    optimizer=None,\n)\nSet up the scheduler. The optimizer of the trainer must have been set up either before this method is called or\npassed as an argument.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nnum_training_steps\nint\nThe number of training steps to do.\nrequired\n\n\noptimizer\ntorch.optim.Optimizer\nThe training optimizer\nNone"
+    "text": "cli.main\nClick CLI definitions for various axolotl commands.\n\n\n\n\n\nName\nDescription\n\n\n\n\nagent_docs\nShow agent-optimized documentation.\n\n\ncli\nAxolotl CLI - Train and fine-tune large language models\n\n\nconfig_schema\nDump the full config JSON schema.\n\n\nevaluate\nEvaluate a model.\n\n\nfetch\nFetch example configs or other resources.\n\n\ninference\nRun inference with a trained model.\n\n\nmerge_lora\nMerge trained LoRA adapters into a base model.\n\n\nmerge_sharded_fsdp_weights\nMerge sharded FSDP model weights.\n\n\npreprocess\nPreprocess datasets before training.\n\n\ntrain\nTrain or fine-tune a model.\n\n\n\n\n\ncli.main.agent_docs(topic, list_topics)\nShow agent-optimized documentation.\nPrints reference docs designed for AI coding agents.\nThese docs are bundled with the package — no network access needed.\n\b\nExamples:\naxolotl agent-docs # overview (start here)\naxolotl agent-docs grpo # GRPO reference\naxolotl agent-docs sft # SFT reference\naxolotl agent-docs –list # list all topics\n\n\n\ncli.main.cli()\nAxolotl CLI - Train and fine-tune large language models\n\n\n\ncli.main.config_schema(output_format, field)\nDump the full config JSON schema.\nUseful for AI agents and tooling to discover all available config options,\ntheir types, defaults, and descriptions.\n\b\nExamples:\naxolotl config-schema # full JSON schema\naxolotl config-schema –format yaml # YAML format\naxolotl config-schema –field adapter # single field\n\n\n\ncli.main.evaluate(ctx, config, launcher, **kwargs)\nEvaluate a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\nclick.Context\nClick context for extra args.\nrequired\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nlauncher\nstr\nLauncher to use for multi-GPU evaluation (“accelerate”, “torchrun”, or “python”).\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config 
options.\n{}\n\n\n\n\n\n\n\ncli.main.fetch(directory, dest)\nFetch example configs or other resources.\nAvailable directories:\n- examples: Example configuration files\n- deepspeed_configs: DeepSpeed configuration files\n- docs: Full documentation (Quarto markdown files)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndirectory\nstr\nOne of examples, deepspeed_configs, docs.\nrequired\n\n\ndest\nOptional[str]\nOptional destination directory.\nrequired\n\n\n\n\n\n\n\ncli.main.inference(ctx, config, launcher, gradio, **kwargs)\nRun inference with a trained model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\nclick.Context\nClick context for extra args.\nrequired\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nlauncher\nstr\nLauncher to use for multi-GPU inference (“accelerate”, “torchrun”, or “python”).\nrequired\n\n\ngradio\nbool\nWhether to use Gradio browser interface or command line for inference.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_lora(config, **kwargs)\nMerge trained LoRA adapters into a base model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_sharded_fsdp_weights(ctx, config, launcher, **kwargs)\nMerge sharded FSDP model weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\nclick.Context\nClick context for extra args.\nrequired\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nlauncher\nstr\nLauncher to use for weight merging (“accelerate”, “torchrun”, or “python”).\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.preprocess(config, cloud=None, 
**kwargs)\nPreprocess datasets before training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\ncloud\nOptional[str]\nPath to a cloud accelerator configuration file.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.train(\n    ctx,\n    config,\n    launcher='accelerate',\n    cloud=None,\n    sweep=None,\n    **kwargs,\n)\nTrain or fine-tune a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\nclick.Context\nClick context for extra args.\nrequired\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nlauncher\nLiteral['accelerate', 'torchrun', 'python']\nLauncher to use for multi-GPU training (“accelerate”, “torchrun”, or “python”).\n'accelerate'\n\n\ncloud\nstr | None\nPath to a cloud accelerator configuration file\nNone\n\n\nsweep\nstr | None\nPath to YAML config for sweeping hyperparameters.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}"
   },
   {
-    "objectID": "docs/api/core.trainers.mixins.scheduler.html#classes",
-    "href": "docs/api/core.trainers.mixins.scheduler.html#classes",
-    "title": "core.trainers.mixins.scheduler",
+    "objectID": "docs/api/cli.main.html#functions",
+    "href": "docs/api/cli.main.html#functions",
+    "title": "cli.main",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nSchedulerMixin\nMixin class for scheduler setup in CausalTrainer.\n\n\n\n\n\ncore.trainers.mixins.scheduler.SchedulerMixin()\nMixin class for scheduler setup in CausalTrainer.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncreate_scheduler\nSet up the scheduler. The optimizer of the trainer must have been set up either before this method is called or\n\n\n\n\n\ncore.trainers.mixins.scheduler.SchedulerMixin.create_scheduler(\n    num_training_steps,\n    optimizer=None,\n)\nSet up the scheduler. The optimizer of the trainer must have been set up either before this method is called or\npassed as an argument.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nnum_training_steps\nint\nThe number of training steps to do.\nrequired\n\n\noptimizer\ntorch.optim.Optimizer\nThe training optimizer\nNone"
+    "text": "Name\nDescription\n\n\n\n\nagent_docs\nShow agent-optimized documentation.\n\n\ncli\nAxolotl CLI - Train and fine-tune large language models\n\n\nconfig_schema\nDump the full config JSON schema.\n\n\nevaluate\nEvaluate a model.\n\n\nfetch\nFetch example configs or other resources.\n\n\ninference\nRun inference with a trained model.\n\n\nmerge_lora\nMerge trained LoRA adapters into a base model.\n\n\nmerge_sharded_fsdp_weights\nMerge sharded FSDP model weights.\n\n\npreprocess\nPreprocess datasets before training.\n\n\ntrain\nTrain or fine-tune a model.\n\n\n\n\n\ncli.main.agent_docs(topic, list_topics)\nShow agent-optimized documentation.\nPrints reference docs designed for AI coding agents.\nThese docs are bundled with the package — no network access needed.\n\b\nExamples:\naxolotl agent-docs # overview (start here)\naxolotl agent-docs grpo # GRPO reference\naxolotl agent-docs sft # SFT reference\naxolotl agent-docs –list # list all topics\n\n\n\ncli.main.cli()\nAxolotl CLI - Train and fine-tune large language models\n\n\n\ncli.main.config_schema(output_format, field)\nDump the full config JSON schema.\nUseful for AI agents and tooling to discover all available config options,\ntheir types, defaults, and descriptions.\n\b\nExamples:\naxolotl config-schema # full JSON schema\naxolotl config-schema –format yaml # YAML format\naxolotl config-schema –field adapter # single field\n\n\n\ncli.main.evaluate(ctx, config, launcher, **kwargs)\nEvaluate a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\nclick.Context\nClick context for extra args.\nrequired\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nlauncher\nstr\nLauncher to use for multi-GPU evaluation (“accelerate”, “torchrun”, or “python”).\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.fetch(directory, dest)\nFetch example configs or other resources.\nAvailable 
directories:\n- examples: Example configuration files\n- deepspeed_configs: DeepSpeed configuration files\n- docs: Full documentation (Quarto markdown files)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndirectory\nstr\nOne of examples, deepspeed_configs, docs.\nrequired\n\n\ndest\nOptional[str]\nOptional destination directory.\nrequired\n\n\n\n\n\n\n\ncli.main.inference(ctx, config, launcher, gradio, **kwargs)\nRun inference with a trained model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\nclick.Context\nClick context for extra args.\nrequired\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nlauncher\nstr\nLauncher to use for multi-GPU inference (“accelerate”, “torchrun”, or “python”).\nrequired\n\n\ngradio\nbool\nWhether to use Gradio browser interface or command line for inference.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_lora(config, **kwargs)\nMerge trained LoRA adapters into a base model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_sharded_fsdp_weights(ctx, config, launcher, **kwargs)\nMerge sharded FSDP model weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\nclick.Context\nClick context for extra args.\nrequired\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nlauncher\nstr\nLauncher to use for weight merging (“accelerate”, “torchrun”, or “python”).\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.preprocess(config, cloud=None, **kwargs)\nPreprocess datasets before 
training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\ncloud\nOptional[str]\nPath to a cloud accelerator configuration file.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.train(\n    ctx,\n    config,\n    launcher='accelerate',\n    cloud=None,\n    sweep=None,\n    **kwargs,\n)\nTrain or fine-tune a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\nclick.Context\nClick context for extra args.\nrequired\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nlauncher\nLiteral['accelerate', 'torchrun', 'python']\nLauncher to use for multi-GPU training (“accelerate”, “torchrun”, or “python”).\n'accelerate'\n\n\ncloud\nstr | None\nPath to a cloud accelerator configuration file\nNone\n\n\nsweep\nstr | None\nPath to YAML config for sweeping hyperparameters.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}"
   },
   {
-    "objectID": "docs/api/common.datasets.html",
-    "href": "docs/api/common.datasets.html",
-    "title": "common.datasets",
+    "objectID": "docs/api/monkeypatch.stablelm_attn_hijack_flash.html",
+    "href": "docs/api/monkeypatch.stablelm_attn_hijack_flash.html",
+    "title": "monkeypatch.stablelm_attn_hijack_flash",
     "section": "",
-    "text": "common.datasets\nDataset loading utilities.\n\n\n\n\n\nName\nDescription\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and validation datasets and metadata.\n\n\n\n\n\ncommon.datasets.TrainDatasetMeta(\n    train_dataset,\n    eval_dataset=None,\n    total_num_steps=None,\n)\nDataclass with fields for training and validation datasets and metadata.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_datasets\nLoads one or more training or evaluation datasets, calling\n\n\nload_preference_datasets\nLoads one or more training or evaluation datasets for RL training using paired\n\n\nsample_dataset\nRandomly sample num_samples samples with replacement from dataset.\n\n\n\n\n\ncommon.datasets.load_datasets(cfg, cli_args=None, debug=False)\nLoads one or more training or evaluation datasets, calling\naxolotl.utils.data.prepare_datasets. Optionally, logs out debug information.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nPreprocessCliArgs | TrainerCliArgs | None\nCommand-specific CLI arguments.\nNone\n\n\ndebug\nbool\nWhether to print out tokenization of sample. 
This is duplicated in cfg and cli_args, but is kept due to use in our Colab notebooks.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and evaluation datasets and the computed total_num_steps.\n\n\n\n\n\n\n\ncommon.datasets.load_preference_datasets(cfg, cli_args=None)\nLoads one or more training or evaluation datasets for RL training using paired\npreference data, calling axolotl.utils.data.rl.prepare_preference_datasets.\nOptionally, logs out debug information.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nPreprocessCliArgs | TrainerCliArgs | None\nCommand-specific CLI arguments.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and evaluation datasets and the computed\n\n\n\nTrainDatasetMeta\ntotal_num_steps.\n\n\n\n\n\n\n\ncommon.datasets.sample_dataset(dataset, num_samples)\nRandomly sample num_samples samples with replacement from dataset."
+    "text": "monkeypatch.stablelm_attn_hijack_flash\nPyTorch StableLM Epoch model.\n\n\n\n\n\nName\nDescription\n\n\n\n\nrepeat_kv\nThis is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,\n\n\nrotate_half\nRotates half the hidden dims of the input.\n\n\n\n\n\nmonkeypatch.stablelm_attn_hijack_flash.repeat_kv(hidden_states, n_rep)\nThis is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,\nnum_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)\n\n\n\nmonkeypatch.stablelm_attn_hijack_flash.rotate_half(x)\nRotates half the hidden dims of the input."
   },
   {
-    "objectID": "docs/api/common.datasets.html#classes",
-    "href": "docs/api/common.datasets.html#classes",
-    "title": "common.datasets",
+    "objectID": "docs/api/monkeypatch.stablelm_attn_hijack_flash.html#functions",
+    "href": "docs/api/monkeypatch.stablelm_attn_hijack_flash.html#functions",
+    "title": "monkeypatch.stablelm_attn_hijack_flash",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and validation datasets and metadata.\n\n\n\n\n\ncommon.datasets.TrainDatasetMeta(\n    train_dataset,\n    eval_dataset=None,\n    total_num_steps=None,\n)\nDataclass with fields for training and validation datasets and metadata."
+    "text": "Name\nDescription\n\n\n\n\nrepeat_kv\nThis is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,\n\n\nrotate_half\nRotates half the hidden dims of the input.\n\n\n\n\n\nmonkeypatch.stablelm_attn_hijack_flash.repeat_kv(hidden_states, n_rep)\nThis is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,\nnum_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)\n\n\n\nmonkeypatch.stablelm_attn_hijack_flash.rotate_half(x)\nRotates half the hidden dims of the input."
   },
   {
-    "objectID": "docs/api/common.datasets.html#functions",
-    "href": "docs/api/common.datasets.html#functions",
-    "title": "common.datasets",
+    "objectID": "docs/api/cli.utils.fetch.html",
+    "href": "docs/api/cli.utils.fetch.html",
+    "title": "cli.utils.fetch",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nload_datasets\nLoads one or more training or evaluation datasets, calling\n\n\nload_preference_datasets\nLoads one or more training or evaluation datasets for RL training using paired\n\n\nsample_dataset\nRandomly sample num_samples samples with replacement from dataset.\n\n\n\n\n\ncommon.datasets.load_datasets(cfg, cli_args=None, debug=False)\nLoads one or more training or evaluation datasets, calling\naxolotl.utils.data.prepare_datasets. Optionally, logs out debug information.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nPreprocessCliArgs | TrainerCliArgs | None\nCommand-specific CLI arguments.\nNone\n\n\ndebug\nbool\nWhether to print out tokenization of sample. This is duplicated in cfg and cli_args, but is kept due to use in our Colab notebooks.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and evaluation datasets and the computed total_num_steps.\n\n\n\n\n\n\n\ncommon.datasets.load_preference_datasets(cfg, cli_args=None)\nLoads one or more training or evaluation datasets for RL training using paired\npreference data, calling axolotl.utils.data.rl.prepare_preference_datasets.\nOptionally, logs out debug information.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nPreprocessCliArgs | TrainerCliArgs | None\nCommand-specific CLI arguments.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTrainDatasetMeta\nDataclass with fields for training and evaluation datasets and the computed\n\n\n\nTrainDatasetMeta\ntotal_num_steps.\n\n\n\n\n\n\n\ncommon.datasets.sample_dataset(dataset, num_samples)\nRandomly sample num_samples samples with replacement from dataset."
+    "text": "cli.utils.fetch\nUtilities for axolotl fetch CLI command.\n\n\n\n\n\nName\nDescription\n\n\n\n\nfetch_from_github\nSync files from a specific directory in the GitHub repository.\n\n\n\n\n\ncli.utils.fetch.fetch_from_github(dir_prefix, dest_dir=None, max_workers=5)\nSync files from a specific directory in the GitHub repository.\nOnly downloads files that don’t exist locally or have changed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndir_prefix\nstr\nDirectory prefix to filter files (e.g., ‘examples/’, ‘deepspeed_configs/’).\nrequired\n\n\ndest_dir\nstr | None\nLocal destination directory.\nNone\n\n\nmax_workers\nint\nMaximum number of concurrent downloads.\n5"
   },
   {
-    "objectID": "docs/api/prompt_strategies.kto.chatml.html",
-    "href": "docs/api/prompt_strategies.kto.chatml.html",
-    "title": "prompt_strategies.kto.chatml",
+    "objectID": "docs/api/cli.utils.fetch.html#functions",
+    "href": "docs/api/cli.utils.fetch.html#functions",
+    "title": "cli.utils.fetch",
     "section": "",
-    "text": "prompt_strategies.kto.chatml\nKTO strategies for chatml\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/kto-mix-15k conversations\n\n\nintel\nFor Intel Orca KTO\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.kto.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/kto-mix-15k conversations\n\n\n\nprompt_strategies.kto.chatml.intel(cfg, **kwargs)\nFor Intel Orca KTO\nex: argilla/distilabel-intel-orca-kto\n\n\n\nprompt_strategies.kto.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations\nex: argilla/ultrafeedback-binarized-preferences-cleaned-kto"
+    "text": "Name\nDescription\n\n\n\n\nfetch_from_github\nSync files from a specific directory in the GitHub repository.\n\n\n\n\n\ncli.utils.fetch.fetch_from_github(dir_prefix, dest_dir=None, max_workers=5)\nSync files from a specific directory in the GitHub repository.\nOnly downloads files that don’t exist locally or have changed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndir_prefix\nstr\nDirectory prefix to filter files (e.g., ‘examples/’, ‘deepspeed_configs/’).\nrequired\n\n\ndest_dir\nstr | None\nLocal destination directory.\nNone\n\n\nmax_workers\nint\nMaximum number of concurrent downloads.\n5"
   },
   {
-    "objectID": "docs/api/prompt_strategies.kto.chatml.html#functions",
-    "href": "docs/api/prompt_strategies.kto.chatml.html#functions",
-    "title": "prompt_strategies.kto.chatml",
+    "objectID": "docs/api/loaders.adapter.html",
+    "href": "docs/api/loaders.adapter.html",
+    "title": "loaders.adapter",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/kto-mix-15k conversations\n\n\nintel\nFor Intel Orca KTO\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.kto.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/kto-mix-15k conversations\n\n\n\nprompt_strategies.kto.chatml.intel(cfg, **kwargs)\nFor Intel Orca KTO\nex: argilla/distilabel-intel-orca-kto\n\n\n\nprompt_strategies.kto.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations\nex: argilla/ultrafeedback-binarized-preferences-cleaned-kto"
+    "text": "loaders.adapter\nAdapter loading functionality, including LoRA / QLoRA and associated utils\n\n\n\n\n\nName\nDescription\n\n\n\n\nsetup_quantized_meta_for_peft\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\nsetup_quantized_peft_meta_for_training\nReplaces dummy quant_state.to method with the original function to allow training to continue\n\n\n\n\n\nloaders.adapter.setup_quantized_meta_for_peft(model)\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\n\nloaders.adapter.setup_quantized_peft_meta_for_training(model)\nReplaces dummy quant_state.to method with the original function to allow training to continue"
   },
   {
-    "objectID": "docs/api/prompt_strategies.user_defined.html",
-    "href": "docs/api/prompt_strategies.user_defined.html",
-    "title": "prompt_strategies.user_defined",
+    "objectID": "docs/api/loaders.adapter.html#functions",
+    "href": "docs/api/loaders.adapter.html#functions",
+    "title": "loaders.adapter",
     "section": "",
-    "text": "prompt_strategies.user_defined\nUser Defined prompts with configuration from the YML config\n\n\n\n\n\nName\nDescription\n\n\n\n\nUserDefinedDatasetConfig\ndataclass configuration representing a userdefined dataset type\n\n\nUserDefinedPromptTokenizationStrategy\nPrompt Tokenization Strategy for user defined prompts\n\n\n\n\n\nprompt_strategies.user_defined.UserDefinedDatasetConfig(\n    system_prompt='',\n    field_system='system',\n    field_instruction='instruction',\n    field_input='input',\n    field_output='output',\n    format='{instruction} {input} ',\n    no_input_format='{instruction} ',\n    system_format='{system}',\n)\ndataclass configuration representing a userdefined dataset type\n\n\n\nprompt_strategies.user_defined.UserDefinedPromptTokenizationStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nPrompt Tokenization Strategy for user defined prompts"
+    "text": "Name\nDescription\n\n\n\n\nsetup_quantized_meta_for_peft\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\nsetup_quantized_peft_meta_for_training\nReplaces dummy quant_state.to method with the original function to allow training to continue\n\n\n\n\n\nloaders.adapter.setup_quantized_meta_for_peft(model)\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\n\nloaders.adapter.setup_quantized_peft_meta_for_training(model)\nReplaces dummy quant_state.to method with the original function to allow training to continue"
   },
   {
-    "objectID": "docs/api/prompt_strategies.user_defined.html#classes",
-    "href": "docs/api/prompt_strategies.user_defined.html#classes",
-    "title": "prompt_strategies.user_defined",
+    "objectID": "docs/api/prompt_strategies.llama2_chat.html",
+    "href": "docs/api/prompt_strategies.llama2_chat.html",
+    "title": "prompt_strategies.llama2_chat",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nUserDefinedDatasetConfig\ndataclass configuration representing a userdefined dataset type\n\n\nUserDefinedPromptTokenizationStrategy\nPrompt Tokenization Strategy for user defined prompts\n\n\n\n\n\nprompt_strategies.user_defined.UserDefinedDatasetConfig(\n    system_prompt='',\n    field_system='system',\n    field_instruction='instruction',\n    field_input='input',\n    field_output='output',\n    format='{instruction} {input} ',\n    no_input_format='{instruction} ',\n    system_format='{system}',\n)\ndataclass configuration representing a userdefined dataset type\n\n\n\nprompt_strategies.user_defined.UserDefinedPromptTokenizationStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nPrompt Tokenization Strategy for user defined prompts"
+    "text": "prompt_strategies.llama2_chat\nPrompt Strategy for finetuning Llama2 chat models\nsee also https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/generation.py#L213 for ma reference implementation.\nThis implementation is based on the Vicuna PR and the fastchat repo, see also:\nhttps://github.com/lm-sys/FastChat/blob/cdd7730686cb1bf9ae2b768ee171bdf7d1ff04f3/fastchat/conversation.py#L847\nUse dataset type: “llama2_chat” in conig.yml to use this prompt style.\nE.g. in the config.yml:\ndatasets:\n  - path: llama_finetune_train.jsonl\n    type: llama2_chat\nThe dataset itself should look like this:\n{'conversations':[{\"from\": \"human\", \"value\": \"Who are you?\"}, {\"from\": \"gpt\", \"value\": \"I am Vicuna\"},...]}\nin a jsonl file. The first message should be from the human, the second from gpt.\nFor a custom system message, the first “from” can be “system” (followed by alternating “human” and “gpt” turns).\nImportant: Don’t use “special_tokens:” in your config.yml if you are not sure what you are doing!\n\n\n\n\n\nName\nDescription\n\n\n\n\nLLama2ChatTokenizingStrategy\nTokenizing strategy for Llama2 prompts.\n\n\nLlama2ChatConversation\nA class that manages prompt templates and keeps all conversation history.\n\n\nLlama2ChatPrompter\nA prompter that generates prompts for Llama2 models.\n\n\n\n\n\nprompt_strategies.llama2_chat.LLama2ChatTokenizingStrategy(*args, **kwargs)\nTokenizing strategy for Llama2 prompts.\nadapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation(\n    name='llama2',\n    system=\"[INST] &lt;&lt;SYS&gt;&gt;\\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. 
Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\\n&lt;&lt;/SYS&gt;&gt;\\n\\n\",\n    roles=('[INST]', '[/INST]'),\n    messages=list(),\n    offset=0,\n)\nA class that manages prompt templates and keeps all conversation history.\ncopied from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py\n\n\n\n\n\nName\nDescription\n\n\n\n\nappend_message\nAppend a new message.\n\n\nget_prompt\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.append_message(\n    role,\n    message,\n)\nAppend a new message.\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.get_prompt()\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatPrompter()\nA prompter that generates prompts for Llama2 models."
   },
   {
-    "objectID": "docs/api/prompt_strategies.chat_template.html",
-    "href": "docs/api/prompt_strategies.chat_template.html",
-    "title": "prompt_strategies.chat_template",
+    "objectID": "docs/api/prompt_strategies.llama2_chat.html#classes",
+    "href": "docs/api/prompt_strategies.llama2_chat.html#classes",
+    "title": "prompt_strategies.llama2_chat",
     "section": "",
-    "text": "prompt_strategies.chat_template\nHF Chat Templates prompt strategy\n\n\n\n\n\nName\nDescription\n\n\n\n\nChatTemplatePrompter\nPrompter for HF chat templates\n\n\nChatTemplateStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nMistralPrompter\nMistral prompter for chat template.\n\n\nMistralStrategy\nMistral strategy for chat template.\n\n\nStrategyLoader\nLoad chat template strategy based on configuration.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplatePrompter(\n    tokenizer,\n    chat_template,\n    processor=None,\n    max_length=2048,\n    message_property_mappings=None,\n    message_field_training=None,\n    message_field_training_detail=None,\n    field_messages='messages',\n    field_system='system',\n    field_tools='tools',\n    field_thinking='reasoning_content',\n    roles=None,\n    template_thinking_key='reasoning_content',\n    chat_template_kwargs=None,\n    drop_system_message=False,\n)\nPrompter for HF chat templates\n\n\n\n\n\nName\nDescription\n\n\n\n\nbuild_prompt\nBuild a prompt from a conversation.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplatePrompter.build_prompt(\n    conversation,\n    add_generation_prompt=False,\n    images=None,\n    tools=None,\n    real_last_index=None,\n)\nBuild a prompt from a conversation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconversation\nlist[dict]\nA list of messages.\nrequired\n\n\nadd_generation_prompt\n\nWhether to add a generation prompt.\nFalse\n\n\nimages\n\nA list of images. (optional)\nNone\n\n\ntools\n\nA list of tools. 
(optional)\nNone\n\n\n\n\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs,\n    sequence_len,\n    roles_to_train=None,\n    train_on_eos=None,\n    train_on_eot=None,\n    eot_tokens=None,\n    split_thinking=False,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\n\n\nName\nDescription\n\n\n\n\nfind_first_eot_token\nFind the first EOT token in the input_ids starting from start_idx.\n\n\nfind_turn\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\ntokenize_prompt\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_first_eot_token(\n    input_ids,\n    start_idx,\n)\nFind the first EOT token in the input_ids starting from start_idx.\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_turn(\n    turns,\n    turn_idx,\n    tools=None,\n    content_only=False,\n    reasoning_only=False,\n)\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncontent_only\nbool\nIf True and the turn has reasoning_content (template_thinking_key), preserve reasoning_content in the dummy turn so the diff only captures the content field boundaries. 
This is needed for correct training_detail alignment when reasoning_content is present.\nFalse\n\n\nreasoning_only\nbool\nIf True, preserve content in the dummy turn and replace reasoning_content with a dummy, so the diff only captures the reasoning_content field boundaries.\nFalse\n\n\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.tokenize_prompt(prompt)\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.MistralPrompter(*args, **kwargs)\nMistral prompter for chat template.\n\n\n\nprompt_strategies.chat_template.MistralStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs,\n    sequence_len,\n    roles_to_train=None,\n    train_on_eos=None,\n    train_on_eot=None,\n    eot_tokens=None,\n    split_thinking=False,\n)\nMistral strategy for chat template.\n\n\n\n\n\nName\nDescription\n\n\n\n\nfind_first_eot_token\nFind the first EOT token in the input_ids starting from start_idx.\n\n\n\n\n\nprompt_strategies.chat_template.MistralStrategy.find_first_eot_token(\n    input_ids,\n    start_idx,\n)\nFind the first EOT token in the input_ids starting from start_idx.\n\n\n\n\n\nprompt_strategies.chat_template.StrategyLoader()\nLoad chat template strategy based on configuration."
+    "text": "Name\nDescription\n\n\n\n\nLLama2ChatTokenizingStrategy\nTokenizing strategy for Llama2 prompts.\n\n\nLlama2ChatConversation\nA class that manages prompt templates and keeps all conversation history.\n\n\nLlama2ChatPrompter\nA prompter that generates prompts for Llama2 models.\n\n\n\n\n\nprompt_strategies.llama2_chat.LLama2ChatTokenizingStrategy(*args, **kwargs)\nTokenizing strategy for Llama2 prompts.\nadapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation(\n    name='llama2',\n    system=\"[INST] &lt;&lt;SYS&gt;&gt;\\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\\n&lt;&lt;/SYS&gt;&gt;\\n\\n\",\n    roles=('[INST]', '[/INST]'),\n    messages=list(),\n    offset=0,\n)\nA class that manages prompt templates and keeps all conversation history.\ncopied from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py\n\n\n\n\n\nName\nDescription\n\n\n\n\nappend_message\nAppend a new message.\n\n\nget_prompt\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.append_message(\n    role,\n    message,\n)\nAppend a new message.\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.get_prompt()\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatPrompter()\nA prompter that generates prompts for Llama2 models."
   },
   {
-    "objectID": "docs/api/prompt_strategies.chat_template.html#classes",
-    "href": "docs/api/prompt_strategies.chat_template.html#classes",
-    "title": "prompt_strategies.chat_template",
+    "objectID": "docs/api/cli.merge_sharded_fsdp_weights.html",
+    "href": "docs/api/cli.merge_sharded_fsdp_weights.html",
+    "title": "cli.merge_sharded_fsdp_weights",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nChatTemplatePrompter\nPrompter for HF chat templates\n\n\nChatTemplateStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nMistralPrompter\nMistral prompter for chat template.\n\n\nMistralStrategy\nMistral strategy for chat template.\n\n\nStrategyLoader\nLoad chat template strategy based on configuration.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplatePrompter(\n    tokenizer,\n    chat_template,\n    processor=None,\n    max_length=2048,\n    message_property_mappings=None,\n    message_field_training=None,\n    message_field_training_detail=None,\n    field_messages='messages',\n    field_system='system',\n    field_tools='tools',\n    field_thinking='reasoning_content',\n    roles=None,\n    template_thinking_key='reasoning_content',\n    chat_template_kwargs=None,\n    drop_system_message=False,\n)\nPrompter for HF chat templates\n\n\n\n\n\nName\nDescription\n\n\n\n\nbuild_prompt\nBuild a prompt from a conversation.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplatePrompter.build_prompt(\n    conversation,\n    add_generation_prompt=False,\n    images=None,\n    tools=None,\n    real_last_index=None,\n)\nBuild a prompt from a conversation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconversation\nlist[dict]\nA list of messages.\nrequired\n\n\nadd_generation_prompt\n\nWhether to add a generation prompt.\nFalse\n\n\nimages\n\nA list of images. (optional)\nNone\n\n\ntools\n\nA list of tools. 
(optional)\nNone\n\n\n\n\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs,\n    sequence_len,\n    roles_to_train=None,\n    train_on_eos=None,\n    train_on_eot=None,\n    eot_tokens=None,\n    split_thinking=False,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\n\n\nName\nDescription\n\n\n\n\nfind_first_eot_token\nFind the first EOT token in the input_ids starting from start_idx.\n\n\nfind_turn\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\ntokenize_prompt\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_first_eot_token(\n    input_ids,\n    start_idx,\n)\nFind the first EOT token in the input_ids starting from start_idx.\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_turn(\n    turns,\n    turn_idx,\n    tools=None,\n    content_only=False,\n    reasoning_only=False,\n)\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncontent_only\nbool\nIf True and the turn has reasoning_content (template_thinking_key), preserve reasoning_content in the dummy turn so the diff only captures the content field boundaries. 
This is needed for correct training_detail alignment when reasoning_content is present.\nFalse\n\n\nreasoning_only\nbool\nIf True, preserve content in the dummy turn and replace reasoning_content with a dummy, so the diff only captures the reasoning_content field boundaries.\nFalse\n\n\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.tokenize_prompt(prompt)\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.MistralPrompter(*args, **kwargs)\nMistral prompter for chat template.\n\n\n\nprompt_strategies.chat_template.MistralStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs,\n    sequence_len,\n    roles_to_train=None,\n    train_on_eos=None,\n    train_on_eot=None,\n    eot_tokens=None,\n    split_thinking=False,\n)\nMistral strategy for chat template.\n\n\n\n\n\nName\nDescription\n\n\n\n\nfind_first_eot_token\nFind the first EOT token in the input_ids starting from start_idx.\n\n\n\n\n\nprompt_strategies.chat_template.MistralStrategy.find_first_eot_token(\n    input_ids,\n    start_idx,\n)\nFind the first EOT token in the input_ids starting from start_idx.\n\n\n\n\n\nprompt_strategies.chat_template.StrategyLoader()\nLoad chat template strategy based on configuration."
+    "text": "cli.merge_sharded_fsdp_weights\nCLI to merge sharded FSDP model checkpoints into a single combined checkpoint.\n\n\n\n\n\nName\nDescription\n\n\n\n\nBFloat16CastPlanner\nA custom planner to cast tensors to bfloat16 on the fly during loading.\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.BFloat16CastPlanner()\nA custom planner to cast tensors to bfloat16 on the fly during loading.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\nmerge_fsdp_weights\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.merge_fsdp_weights(\n    checkpoint_dir,\n    output_path,\n    remove_checkpoint_dir=False,\n)\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\nSHARDED_STATE_DICT was used for the model. Weights will be saved to {output_path}/model.safetensors.\nNote: this is a CPU-bound process.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncheckpoint_dir\nstr\nThe directory containing the FSDP checkpoints (can be either the model or optimizer).\nrequired\n\n\noutput_path\nstr\nThe path to save the merged checkpoint.\nrequired\n\n\nremove_checkpoint_dir\nbool, optional, defaults to False\nWhether to remove the checkpoint directory after merging.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf torch version &lt; 2.3.0, or if checkpoint_dir does not exist."
   },
   {
-    "objectID": "docs/api/kernels.utils.html",
-    "href": "docs/api/kernels.utils.html",
-    "title": "kernels.utils",
+    "objectID": "docs/api/cli.merge_sharded_fsdp_weights.html#classes",
+    "href": "docs/api/cli.merge_sharded_fsdp_weights.html#classes",
+    "title": "cli.merge_sharded_fsdp_weights",
     "section": "",
-    "text": "kernels.utils\nkernels.utils\nUtilities for axolotl.kernels submodules."
+    "text": "Name\nDescription\n\n\n\n\nBFloat16CastPlanner\nA custom planner to cast tensors to bfloat16 on the fly during loading.\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.BFloat16CastPlanner()\nA custom planner to cast tensors to bfloat16 on the fly during loading."
   },
   {
-    "objectID": "docs/api/monkeypatch.mistral_attn_hijack_flash.html",
-    "href": "docs/api/monkeypatch.mistral_attn_hijack_flash.html",
-    "title": "monkeypatch.mistral_attn_hijack_flash",
+    "objectID": "docs/api/cli.merge_sharded_fsdp_weights.html#functions",
+    "href": "docs/api/cli.merge_sharded_fsdp_weights.html#functions",
+    "title": "cli.merge_sharded_fsdp_weights",
     "section": "",
-    "text": "monkeypatch.mistral_attn_hijack_flash\nmonkeypatch.mistral_attn_hijack_flash\nFlash attention monkey patch for mistral model"
+    "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\nmerge_fsdp_weights\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.merge_fsdp_weights(\n    checkpoint_dir,\n    output_path,\n    remove_checkpoint_dir=False,\n)\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\nSHARDED_STATE_DICT was used for the model. Weights will be saved to {output_path}/model.safetensors.\nNote: this is a CPU-bound process.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncheckpoint_dir\nstr\nThe directory containing the FSDP checkpoints (can be either the model or optimizer).\nrequired\n\n\noutput_path\nstr\nThe path to save the merged checkpoint.\nrequired\n\n\nremove_checkpoint_dir\nbool, optional, defaults to False\nWhether to remove the checkpoint directory after merging.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf torch version &lt; 2.3.0, or if checkpoint_dir does not exist."
   },
   {
-    "objectID": "docs/api/utils.schemas.config.html",
-    "href": "docs/api/utils.schemas.config.html",
-    "title": "utils.schemas.config",
+    "objectID": "docs/api/common.architectures.html",
+    "href": "docs/api/common.architectures.html",
+    "title": "common.architectures",
     "section": "",
-    "text": "utils.schemas.config\nModule with Pydantic models for configuration.\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlConfigWCapabilities\nWrapper to valdiate GPU capabilities with the configured options\n\n\nAxolotlInputConfig\nWrapper of all config options.\n\n\nEBFTConfig\nConfiguration for Energy-Based Fine-Tuning (EBFT)\n\n\n\n\n\nutils.schemas.config.AxolotlConfigWCapabilities()\nWrapper to valdiate GPU capabilities with the configured options\n\n\n\nutils.schemas.config.AxolotlInputConfig()\nWrapper of all config options.\n\n\n\nutils.schemas.config.EBFTConfig()\nConfiguration for Energy-Based Fine-Tuning (EBFT)"
+    "text": "common.architectures\ncommon.architectures\nCommon architecture specific constants"
   },
   {
-    "objectID": "docs/api/utils.schemas.config.html#classes",
-    "href": "docs/api/utils.schemas.config.html#classes",
-    "title": "utils.schemas.config",
+    "objectID": "docs/api/monkeypatch.llama_attn_hijack_flash.html",
+    "href": "docs/api/monkeypatch.llama_attn_hijack_flash.html",
+    "title": "monkeypatch.llama_attn_hijack_flash",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nAxolotlConfigWCapabilities\nWrapper to valdiate GPU capabilities with the configured options\n\n\nAxolotlInputConfig\nWrapper of all config options.\n\n\nEBFTConfig\nConfiguration for Energy-Based Fine-Tuning (EBFT)\n\n\n\n\n\nutils.schemas.config.AxolotlConfigWCapabilities()\nWrapper to valdiate GPU capabilities with the configured options\n\n\n\nutils.schemas.config.AxolotlInputConfig()\nWrapper of all config options.\n\n\n\nutils.schemas.config.EBFTConfig()\nConfiguration for Energy-Based Fine-Tuning (EBFT)"
+    "text": "monkeypatch.llama_attn_hijack_flash\nFlash attention monkey patch for llama model\n\n\n\n\n\nName\nDescription\n\n\n\n\nflashattn_forward_with_s2attn\nInput shape: Batch x Time x Channel\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.flashattn_forward_with_s2attn(\n    self,\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    padding_mask=None,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\nInput shape: Batch x Time x Channel\nFrom: https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py\nattention_mask: [bsz, q_len]\ncu_seqlens will be ignored if provided\nmax_seqlen will be ignored if provided"
   },
   {
-    "objectID": "docs/api/utils.dict.html",
-    "href": "docs/api/utils.dict.html",
-    "title": "utils.dict",
+    "objectID": "docs/api/monkeypatch.llama_attn_hijack_flash.html#functions",
+    "href": "docs/api/monkeypatch.llama_attn_hijack_flash.html#functions",
+    "title": "monkeypatch.llama_attn_hijack_flash",
     "section": "",
-    "text": "utils.dict\nModule containing the DictDefault class\n\n\n\n\n\nName\nDescription\n\n\n\n\nDictDefault\nA Dict that returns None instead of returning empty Dict for missing keys.\n\n\n\n\n\nutils.dict.DictDefault()\nA Dict that returns None instead of returning empty Dict for missing keys.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nremove_none_values\nRemove null from a dictionary-like obj or list.\n\n\n\n\n\nutils.dict.remove_none_values(obj)\nRemove null from a dictionary-like obj or list.\nThese can appear due to Dataset loading causing schema merge.\nSee https://github.com/axolotl-ai-cloud/axolotl/pull/2909"
+    "text": "Name\nDescription\n\n\n\n\nflashattn_forward_with_s2attn\nInput shape: Batch x Time x Channel\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.flashattn_forward_with_s2attn(\n    self,\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    padding_mask=None,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\nInput shape: Batch x Time x Channel\nFrom: https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py\nattention_mask: [bsz, q_len]\ncu_seqlens will be ignored if provided\nmax_seqlen will be ignored if provided"
   },
   {
-    "objectID": "docs/api/utils.dict.html#classes",
-    "href": "docs/api/utils.dict.html#classes",
-    "title": "utils.dict",
+    "objectID": "docs/api/core.trainers.utils.html",
+    "href": "docs/api/core.trainers.utils.html",
+    "title": "core.trainers.utils",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nDictDefault\nA Dict that returns None instead of returning empty Dict for missing keys.\n\n\n\n\n\nutils.dict.DictDefault()\nA Dict that returns None instead of returning empty Dict for missing keys."
+    "text": "core.trainers.utils\ncore.trainers.utils\nUtils for Axolotl trainers"
   },
   {
-    "objectID": "docs/api/utils.dict.html#functions",
-    "href": "docs/api/utils.dict.html#functions",
-    "title": "utils.dict",
+    "objectID": "docs/api/kernels.geglu.html",
+    "href": "docs/api/kernels.geglu.html",
+    "title": "kernels.geglu",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nremove_none_values\nRemove null from a dictionary-like obj or list.\n\n\n\n\n\nutils.dict.remove_none_values(obj)\nRemove null from a dictionary-like obj or list.\nThese can appear due to Dataset loading causing schema merge.\nSee https://github.com/axolotl-ai-cloud/axolotl/pull/2909"
+    "text": "kernels.geglu\nModule for definition of GEGLU Triton kernels.\nSee “GLU Variants Improve Transformer” (https://arxiv.org/abs/2002.05202).\nCredit to unsloth (https://unsloth.ai/) for inspiration for this implementation.\n\n\n\n\n\nName\nDescription\n\n\n\n\ngeglu_backward\nGEGLU backward pass using in-place operations.\n\n\ngeglu_forward\nGEGLU forward pass.\n\n\n\n\n\nkernels.geglu.geglu_backward(grad_output, gate, up)\nGEGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - GEGLU activation output (h) - Gradient with respect to gate (grad_gate) - Gradient with respect to up (grad_up)\n\n\n\n\n\n\nThis function modifies its input tensors in-place to store results.\n\n\n\n\nkernels.geglu.geglu_forward(gate, up)\nGEGLU forward pass.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\ntorch.Tensor: Output tensor of shape [batch, seq_len, hidden_dim]."
   },
   {
-    "objectID": "docs/api/loaders.constants.html",
-    "href": "docs/api/loaders.constants.html",
-    "title": "loaders.constants",
+    "objectID": "docs/api/kernels.geglu.html#functions",
+    "href": "docs/api/kernels.geglu.html#functions",
+    "title": "kernels.geglu",
     "section": "",
-    "text": "loaders.constants\nloaders.constants\nShared constants for axolotl.loaders module"
+    "text": "Name\nDescription\n\n\n\n\ngeglu_backward\nGEGLU backward pass using in-place operations.\n\n\ngeglu_forward\nGEGLU forward pass.\n\n\n\n\n\nkernels.geglu.geglu_backward(grad_output, gate, up)\nGEGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - GEGLU activation output (h) - Gradient with respect to gate (grad_gate) - Gradient with respect to up (grad_up)\n\n\n\n\n\n\nThis function modifies its input tensors in-place to store results.\n\n\n\n\nkernels.geglu.geglu_forward(gate, up)\nGEGLU forward pass.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\ntorch.Tensor: Output tensor of shape [batch, seq_len, hidden_dim]."
   },
   {
-    "objectID": "docs/api/monkeypatch.data.batch_dataset_fetcher.html",
-    "href": "docs/api/monkeypatch.data.batch_dataset_fetcher.html",
-    "title": "monkeypatch.data.batch_dataset_fetcher",
+    "objectID": "docs/api/utils.collators.mamba.html",
+    "href": "docs/api/utils.collators.mamba.html",
+    "title": "utils.collators.mamba",
     "section": "",
-    "text": "monkeypatch.data.batch_dataset_fetcher\nMonkey patches for the dataset fetcher to handle batches of packed indexes.\n\n\n\n\n\nName\nDescription\n\n\n\n\napply_multipack_dataloader_patch\nThis patch allows DataLoader to correctly process batches that contain multiple bins\n\n\npatch_fetchers\nApply patches to PyTorch’s DataLoader components.\n\n\npatched_worker_loop\nWorker loop that ensures patches are applied in worker processes.\n\n\nremove_multipack_dataloader_patch\nRemove the monkeypatch and restore original PyTorch DataLoader behavior.\n\n\n\n\n\nmonkeypatch.data.batch_dataset_fetcher.apply_multipack_dataloader_patch()\nThis patch allows DataLoader to correctly process batches that contain multiple bins\nof packed sequences.\n\n\n\nmonkeypatch.data.batch_dataset_fetcher.patch_fetchers()\nApply patches to PyTorch’s DataLoader components.\n\n\n\nmonkeypatch.data.batch_dataset_fetcher.patched_worker_loop(*args, **kwargs)\nWorker loop that ensures patches are applied in worker processes.\n\n\n\nmonkeypatch.data.batch_dataset_fetcher.remove_multipack_dataloader_patch()\nRemove the monkeypatch and restore original PyTorch DataLoader behavior."
+    "text": "utils.collators.mamba\ncollators for Mamba\n\n\n\n\n\nName\nDescription\n\n\n\n\nMambaDataCollator\nCollator for State Space Models (Mamba)\n\n\n\n\n\nutils.collators.mamba.MambaDataCollator(tokenizer)\nCollator for State Space Models (Mamba)"
   },
   {
-    "objectID": "docs/api/monkeypatch.data.batch_dataset_fetcher.html#functions",
-    "href": "docs/api/monkeypatch.data.batch_dataset_fetcher.html#functions",
-    "title": "monkeypatch.data.batch_dataset_fetcher",
+    "objectID": "docs/api/utils.collators.mamba.html#classes",
+    "href": "docs/api/utils.collators.mamba.html#classes",
+    "title": "utils.collators.mamba",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\napply_multipack_dataloader_patch\nThis patch allows DataLoader to correctly process batches that contain multiple bins\n\n\npatch_fetchers\nApply patches to PyTorch’s DataLoader components.\n\n\npatched_worker_loop\nWorker loop that ensures patches are applied in worker processes.\n\n\nremove_multipack_dataloader_patch\nRemove the monkeypatch and restore original PyTorch DataLoader behavior.\n\n\n\n\n\nmonkeypatch.data.batch_dataset_fetcher.apply_multipack_dataloader_patch()\nThis patch allows DataLoader to correctly process batches that contain multiple bins\nof packed sequences.\n\n\n\nmonkeypatch.data.batch_dataset_fetcher.patch_fetchers()\nApply patches to PyTorch’s DataLoader components.\n\n\n\nmonkeypatch.data.batch_dataset_fetcher.patched_worker_loop(*args, **kwargs)\nWorker loop that ensures patches are applied in worker processes.\n\n\n\nmonkeypatch.data.batch_dataset_fetcher.remove_multipack_dataloader_patch()\nRemove the monkeypatch and restore original PyTorch DataLoader behavior."
+    "text": "Name\nDescription\n\n\n\n\nMambaDataCollator\nCollator for State Space Models (Mamba)\n\n\n\n\n\nutils.collators.mamba.MambaDataCollator(tokenizer)\nCollator for State Space Models (Mamba)"
   },
   {
-    "objectID": "docs/api/kernels.lora.html",
-    "href": "docs/api/kernels.lora.html",
-    "title": "kernels.lora",
+    "objectID": "docs/api/utils.quantization.html",
+    "href": "docs/api/utils.quantization.html",
+    "title": "utils.quantization",
     "section": "",
-    "text": "kernels.lora\nModule for definition of Low-Rank Adaptation (LoRA) Triton kernels.\nSee “LoRA: Low-Rank Adaptation of Large Language Models”\n(https://arxiv.org/abs/2106.09685).\nAlso supports DoRA (Weight-Decomposed Low-Rank Adaptation):\nSee “DoRA: Weight-Decomposed Low-Rank Adaptation” (https://arxiv.org/abs/2402.09353).\nCredit to unsloth (https://unsloth.ai/) for inspiration for this implementation.\n\n\n\n\n\nName\nDescription\n\n\n\n\nLoRA_Embedding\nFused LoRA embedding: F.embedding(x, W) + s * F.embedding(x, A^T) @ B^T.\n\n\nLoRA_MLP\nOptimized LoRA MLP implementation.\n\n\nLoRA_O\nOptimized LoRA implementation for output projection.\n\n\nLoRA_QK\nOptimized LoRA QK implementation for models where v_proj is None.\n\n\nLoRA_QKV\nOptimized LoRA QKV implementation with quantization support.\n\n\n\n\n\nkernels.lora.LoRA_Embedding()\nFused LoRA embedding: F.embedding(x, W) + s * F.embedding(x, A^T) @ B^T.\nSupports dropout and DoRA.\n\n\n\nkernels.lora.LoRA_MLP()\nOptimized LoRA MLP implementation.\nSupports bias, dropout, and DoRA. Dropout is applied to the input for\ngate/up projections. The down projection uses hidden states (post-activation)\nas input, so dropout is not applied there.\n\n\n\nkernels.lora.LoRA_O()\nOptimized LoRA implementation for output projection.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.LoRA_QK()\nOptimized LoRA QK implementation for models where v_proj is None.\nUsed by models like Gemma4 with attention_k_eq_v=True, where key states are\nreused as value states. 
Only Q and K projections are fused; the caller\nreturns K a second time as V so that autograd accumulates key+value gradients\ninto a single dK.\nSupports bias, dropout, and DoRA (Weight-Decomposed Low-Rank Adaptation).\n\n\n\nkernels.lora.LoRA_QKV()\nOptimized LoRA QKV implementation with quantization support.\nSupports bias, dropout, and DoRA (Weight-Decomposed Low-Rank Adaptation).\nDropout is applied outside this Function so autograd handles its backward.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\napply_lora_embedding\nApplies LoRA to embedding layer.\n\n\napply_lora_mlp_geglu\nApplies LoRA to MLP layer with GEGLU activation.\n\n\napply_lora_mlp_swiglu\nApplies LoRA to MLP layer with SwiGLU activation.\n\n\napply_lora_o\nApplies LoRA to output projection layer.\n\n\napply_lora_qk\nApplies LoRA to compute Query and Key projections for models where v_proj is None.\n\n\napply_lora_qkv\nApplies LoRA to compute Query, Key, Value projections.\n\n\nget_embedding_lora_parameters\nExtract LoRA parameters from a PEFT Embedding module.\n\n\nget_lora_parameters\nGets LoRA parameters from a projection module.\n\n\nmatmul_lora\nEfficient fused matmul + LoRA computation.\n\n\n\n\n\nkernels.lora.apply_lora_embedding(self, x)\nApplies LoRA to embedding layer.\n\n\n\nkernels.lora.apply_lora_mlp_geglu(self, X, inplace=True)\nApplies LoRA to MLP layer with GEGLU activation.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.apply_lora_mlp_swiglu(self, X, inplace=True)\nApplies LoRA to MLP layer with SwiGLU activation.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.apply_lora_o(self, X)\nApplies LoRA to output projection layer.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.apply_lora_qk(self, X, inplace=True)\nApplies LoRA to compute Query and Key projections for models where v_proj is None.\nWhen v_proj is None (e.g. Gemma4 attention_k_eq_v), key states are reused as\nvalue states. 
Returns (Q, K, K) — the caller’s patched forward will use K as V.\nBecause K is returned twice, autograd accumulates gradients from both the key and\nvalue paths into dK before calling LoRA_QK.backward.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.apply_lora_qkv(self, X, inplace=True)\nApplies LoRA to compute Query, Key, Value projections.\nSupports bias, dropout, and DoRA. Dropout is applied outside the autograd\nFunction so PyTorch handles its backward automatically. A single shared\ndropout mask is used across Q, K, V projections for memory efficiency.\n\n\n\nkernels.lora.get_embedding_lora_parameters(embed)\nExtract LoRA parameters from a PEFT Embedding module.\n\n\n\nkernels.lora.get_lora_parameters(proj)\nGets LoRA parameters from a projection module.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nproj\nnn.Module\nThe projection module to extract parameters from.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nA tuple containing:\n\n\n\ntorch.Tensor | None\n- W: base weight tensor\n\n\n\nQuantState | torch.Tensor | None\n- b: base layer bias (or None)\n\n\n\ntorch.Tensor | None\n- quant_state: quantization state (or None)\n\n\n\ntorch.Tensor | None\n- A: LoRA A weight (or None)\n\n\n\nfloat | None\n- B: LoRA B weight (or None)\n\n\n\ntorch.Tensor | None\n- s: LoRA scaling factor (or None)\n\n\n\nnn.Module | None\n- lora_bias: LoRA B bias (or None)\n\n\n\ntorch.Tensor | None\n- dropout: dropout module (or None)\n\n\n\ntuple[torch.Tensor, torch.Tensor | None, QuantState | torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, float | None, torch.Tensor | None, nn.Module | None, torch.Tensor | None]\n- magnitude: DoRA magnitude vector (or None)\n\n\n\n\n\n\n\nkernels.lora.matmul_lora(\n    X,\n    W,\n    b,\n    W_quant,\n    A,\n    B,\n    s,\n    out=None,\n    X_drop=None,\n    lora_bias=None,\n)\nEfficient fused matmul + LoRA 
computation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\ntorch.Tensor\nInput tensor [*, in_features]\nrequired\n\n\nW\ntorch.Tensor\nBase weight matrix [out_features, in_features]\nrequired\n\n\nW_quant\nQuantState | torch.Tensor | None\nQuantization state for W\nrequired\n\n\nA\ntorch.Tensor | None\nLoRA A matrix [rank, in_features]\nrequired\n\n\nB\ntorch.Tensor | None\nLoRA B matrix [out_features, rank]\nrequired\n\n\ns\nfloat | None\nLoRA scaling factor\nrequired\n\n\nout\ntorch.Tensor | None\nOptional output tensor for inplace operations\nNone\n\n\nX_drop\ntorch.Tensor | None\nOptional dropout-applied input for LoRA path (if None, uses X)\nNone\n\n\nlora_bias\ntorch.Tensor | None\nOptional LoRA B layer bias [out_features]\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nResult of X @ W + s * X_drop @ A @ B + b + s * lora_bias"
+    "text": "utils.quantization\nUtilities for quantization including QAT and PTQ using torchao.\n\n\n\n\n\nName\nDescription\n\n\n\n\nconvert_qat_model\nThis function converts a QAT model which has fake quantized layers back to the original model.\n\n\nget_quantization_config\nThis function is used to build a post-training quantization config.\n\n\nprepare_model_for_qat\nThis function is used to prepare a model for QAT by swapping the model’s linear\n\n\nquantize_model\nThis function is used to quantize a model.\n\n\n\n\n\nutils.quantization.convert_qat_model(model, quantize_embedding=False)\nThis function converts a QAT model which has fake quantized layers back to the original model.\n\n\n\nutils.quantization.get_quantization_config(\n    weight_dtype,\n    activation_dtype=None,\n    group_size=None,\n)\nThis function is used to build a post-training quantization config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nweight_dtype\nTorchAOQuantDType\nThe dtype to use for weight quantization.\nrequired\n\n\nactivation_dtype\nTorchAOQuantDType | None\nThe dtype to use for activation quantization.\nNone\n\n\ngroup_size\nint | None\nThe group size to use for weight quantization.\nNone\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nAOBaseConfig\nThe post-training quantization config.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the activation dtype is not specified and the weight dtype is not int8 or int4, or if the group size is not specified for int8 or int4 weight only quantization.\n\n\n\n\n\n\n\nutils.quantization.prepare_model_for_qat(\n    model,\n    weight_dtype,\n    group_size=None,\n    activation_dtype=None,\n    quantize_embedding=False,\n)\nThis function is used to prepare a model for QAT by swapping the model’s linear\nlayers with fake quantized linear layers, and optionally the embedding weights with\nfake quantized embedding 
weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\n\nThe model to quantize.\nrequired\n\n\nweight_dtype\nTorchAOQuantDType\nThe dtype to use for weight quantization.\nrequired\n\n\ngroup_size\nint | None\nThe group size to use for weight quantization.\nNone\n\n\nactivation_dtype\nTorchAOQuantDType | None\nThe dtype to use for activation quantization.\nNone\n\n\nquantize_embedding\nbool\nWhether to quantize the model’s embedding weights.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the activation/weight dtype combination is invalid.\n\n\n\n\n\n\n\nutils.quantization.quantize_model(\n    model,\n    weight_dtype,\n    group_size=None,\n    activation_dtype=None,\n    quantize_embedding=None,\n)\nThis function is used to quantize a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\n\nThe model to quantize.\nrequired\n\n\nweight_dtype\nTorchAOQuantDType\nThe dtype to use for weight quantization.\nrequired\n\n\ngroup_size\nint | None\nThe group size to use for weight quantization.\nNone\n\n\nactivation_dtype\nTorchAOQuantDType | None\nThe dtype to use for activation quantization.\nNone\n\n\nquantize_embedding\nbool | None\nWhether to quantize the model’s embedding weights.\nNone"
   },
   {
-    "objectID": "docs/api/kernels.lora.html#classes",
-    "href": "docs/api/kernels.lora.html#classes",
-    "title": "kernels.lora",
+    "objectID": "docs/api/utils.quantization.html#functions",
+    "href": "docs/api/utils.quantization.html#functions",
+    "title": "utils.quantization",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nLoRA_Embedding\nFused LoRA embedding: F.embedding(x, W) + s * F.embedding(x, A^T) @ B^T.\n\n\nLoRA_MLP\nOptimized LoRA MLP implementation.\n\n\nLoRA_O\nOptimized LoRA implementation for output projection.\n\n\nLoRA_QK\nOptimized LoRA QK implementation for models where v_proj is None.\n\n\nLoRA_QKV\nOptimized LoRA QKV implementation with quantization support.\n\n\n\n\n\nkernels.lora.LoRA_Embedding()\nFused LoRA embedding: F.embedding(x, W) + s * F.embedding(x, A^T) @ B^T.\nSupports dropout and DoRA.\n\n\n\nkernels.lora.LoRA_MLP()\nOptimized LoRA MLP implementation.\nSupports bias, dropout, and DoRA. Dropout is applied to the input for\ngate/up projections. The down projection uses hidden states (post-activation)\nas input, so dropout is not applied there.\n\n\n\nkernels.lora.LoRA_O()\nOptimized LoRA implementation for output projection.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.LoRA_QK()\nOptimized LoRA QK implementation for models where v_proj is None.\nUsed by models like Gemma4 with attention_k_eq_v=True, where key states are\nreused as value states. Only Q and K projections are fused; the caller\nreturns K a second time as V so that autograd accumulates key+value gradients\ninto a single dK.\nSupports bias, dropout, and DoRA (Weight-Decomposed Low-Rank Adaptation).\n\n\n\nkernels.lora.LoRA_QKV()\nOptimized LoRA QKV implementation with quantization support.\nSupports bias, dropout, and DoRA (Weight-Decomposed Low-Rank Adaptation).\nDropout is applied outside this Function so autograd handles its backward."
+    "text": "Name\nDescription\n\n\n\n\nconvert_qat_model\nThis function converts a QAT model which has fake quantized layers back to the original model.\n\n\nget_quantization_config\nThis function is used to build a post-training quantization config.\n\n\nprepare_model_for_qat\nThis function is used to prepare a model for QAT by swapping the model’s linear\n\n\nquantize_model\nThis function is used to quantize a model.\n\n\n\n\n\nutils.quantization.convert_qat_model(model, quantize_embedding=False)\nThis function converts a QAT model which has fake quantized layers back to the original model.\n\n\n\nutils.quantization.get_quantization_config(\n    weight_dtype,\n    activation_dtype=None,\n    group_size=None,\n)\nThis function is used to build a post-training quantization config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nweight_dtype\nTorchAOQuantDType\nThe dtype to use for weight quantization.\nrequired\n\n\nactivation_dtype\nTorchAOQuantDType | None\nThe dtype to use for activation quantization.\nNone\n\n\ngroup_size\nint | None\nThe group size to use for weight quantization.\nNone\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nAOBaseConfig\nThe post-training quantization config.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the activation dtype is not specified and the weight dtype is not int8 or int4, or if the group size is not specified for int8 or int4 weight only quantization.\n\n\n\n\n\n\n\nutils.quantization.prepare_model_for_qat(\n    model,\n    weight_dtype,\n    group_size=None,\n    activation_dtype=None,\n    quantize_embedding=False,\n)\nThis function is used to prepare a model for QAT by swapping the model’s linear\nlayers with fake quantized linear layers, and optionally the embedding weights with\nfake quantized embedding weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\n\nThe model to quantize.\nrequired\n\n\nweight_dtype\nTorchAOQuantDType\nThe 
dtype to use for weight quantization.\nrequired\n\n\ngroup_size\nint | None\nThe group size to use for weight quantization.\nNone\n\n\nactivation_dtype\nTorchAOQuantDType | None\nThe dtype to use for activation quantization.\nNone\n\n\nquantize_embedding\nbool\nWhether to quantize the model’s embedding weights.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the activation/weight dtype combination is invalid.\n\n\n\n\n\n\n\nutils.quantization.quantize_model(\n    model,\n    weight_dtype,\n    group_size=None,\n    activation_dtype=None,\n    quantize_embedding=None,\n)\nThis function is used to quantize a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\n\nThe model to quantize.\nrequired\n\n\nweight_dtype\nTorchAOQuantDType\nThe dtype to use for weight quantization.\nrequired\n\n\ngroup_size\nint | None\nThe group size to use for weight quantization.\nNone\n\n\nactivation_dtype\nTorchAOQuantDType | None\nThe dtype to use for activation quantization.\nNone\n\n\nquantize_embedding\nbool | None\nWhether to quantize the model’s embedding weights.\nNone"
   },
   {
-    "objectID": "docs/api/kernels.lora.html#functions",
-    "href": "docs/api/kernels.lora.html#functions",
-    "title": "kernels.lora",
+    "objectID": "docs/api/cli.vllm_serve.html",
+    "href": "docs/api/cli.vllm_serve.html",
+    "title": "cli.vllm_serve",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\napply_lora_embedding\nApplies LoRA to embedding layer.\n\n\napply_lora_mlp_geglu\nApplies LoRA to MLP layer with GEGLU activation.\n\n\napply_lora_mlp_swiglu\nApplies LoRA to MLP layer with SwiGLU activation.\n\n\napply_lora_o\nApplies LoRA to output projection layer.\n\n\napply_lora_qk\nApplies LoRA to compute Query and Key projections for models where v_proj is None.\n\n\napply_lora_qkv\nApplies LoRA to compute Query, Key, Value projections.\n\n\nget_embedding_lora_parameters\nExtract LoRA parameters from a PEFT Embedding module.\n\n\nget_lora_parameters\nGets LoRA parameters from a projection module.\n\n\nmatmul_lora\nEfficient fused matmul + LoRA computation.\n\n\n\n\n\nkernels.lora.apply_lora_embedding(self, x)\nApplies LoRA to embedding layer.\n\n\n\nkernels.lora.apply_lora_mlp_geglu(self, X, inplace=True)\nApplies LoRA to MLP layer with GEGLU activation.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.apply_lora_mlp_swiglu(self, X, inplace=True)\nApplies LoRA to MLP layer with SwiGLU activation.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.apply_lora_o(self, X)\nApplies LoRA to output projection layer.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.apply_lora_qk(self, X, inplace=True)\nApplies LoRA to compute Query and Key projections for models where v_proj is None.\nWhen v_proj is None (e.g. Gemma4 attention_k_eq_v), key states are reused as\nvalue states. Returns (Q, K, K) — the caller’s patched forward will use K as V.\nBecause K is returned twice, autograd accumulates gradients from both the key and\nvalue paths into dK before calling LoRA_QK.backward.\nSupports bias, dropout, and DoRA.\n\n\n\nkernels.lora.apply_lora_qkv(self, X, inplace=True)\nApplies LoRA to compute Query, Key, Value projections.\nSupports bias, dropout, and DoRA. Dropout is applied outside the autograd\nFunction so PyTorch handles its backward automatically. 
A single shared\ndropout mask is used across Q, K, V projections for memory efficiency.\n\n\n\nkernels.lora.get_embedding_lora_parameters(embed)\nExtract LoRA parameters from a PEFT Embedding module.\n\n\n\nkernels.lora.get_lora_parameters(proj)\nGets LoRA parameters from a projection module.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nproj\nnn.Module\nThe projection module to extract parameters from.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nA tuple containing:\n\n\n\ntorch.Tensor | None\n- W: base weight tensor\n\n\n\nQuantState | torch.Tensor | None\n- b: base layer bias (or None)\n\n\n\ntorch.Tensor | None\n- quant_state: quantization state (or None)\n\n\n\ntorch.Tensor | None\n- A: LoRA A weight (or None)\n\n\n\nfloat | None\n- B: LoRA B weight (or None)\n\n\n\ntorch.Tensor | None\n- s: LoRA scaling factor (or None)\n\n\n\nnn.Module | None\n- lora_bias: LoRA B bias (or None)\n\n\n\ntorch.Tensor | None\n- dropout: dropout module (or None)\n\n\n\ntuple[torch.Tensor, torch.Tensor | None, QuantState | torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, float | None, torch.Tensor | None, nn.Module | None, torch.Tensor | None]\n- magnitude: DoRA magnitude vector (or None)\n\n\n\n\n\n\n\nkernels.lora.matmul_lora(\n    X,\n    W,\n    b,\n    W_quant,\n    A,\n    B,\n    s,\n    out=None,\n    X_drop=None,\n    lora_bias=None,\n)\nEfficient fused matmul + LoRA computation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\ntorch.Tensor\nInput tensor [*, in_features]\nrequired\n\n\nW\ntorch.Tensor\nBase weight matrix [out_features, in_features]\nrequired\n\n\nW_quant\nQuantState | torch.Tensor | None\nQuantization state for W\nrequired\n\n\nA\ntorch.Tensor | None\nLoRA A matrix [rank, in_features]\nrequired\n\n\nB\ntorch.Tensor | None\nLoRA B matrix [out_features, rank]\nrequired\n\n\ns\nfloat | None\nLoRA scaling factor\nrequired\n\n\nout\ntorch.Tensor | None\nOptional 
output tensor for inplace operations\nNone\n\n\nX_drop\ntorch.Tensor | None\nOptional dropout-applied input for LoRA path (if None, uses X)\nNone\n\n\nlora_bias\ntorch.Tensor | None\nOptional LoRA B layer bias [out_features]\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nResult of X @ W + s * X_drop @ A @ B + b + s * lora_bias"
+    "text": "cli.vllm_serve\nCLI to start the vllm server for online RL\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlScriptArguments\nAdditional arguments for the VLLM server\n\n\n\n\n\ncli.vllm_serve.AxolotlScriptArguments(\n    reasoning_parser='',\n    enable_reasoning=None,\n)\nAdditional arguments for the VLLM server\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_vllm_serve\nStarts the VLLM server for serving LLM models used for online RL\n\n\n\n\n\ncli.vllm_serve.do_vllm_serve(config, cli_args)\nStarts the VLLM server for serving LLM models used for online RL\nArgs\n:param cfg: Parsed doct of the YAML config\n:param cli_args: dict of additional command-line arguments of type VllmServeCliArgs\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nprocess_id\n\nthe process id of the started VLLM server"
   },
   {
-    "objectID": "docs/api/datasets.html",
-    "href": "docs/api/datasets.html",
-    "title": "datasets",
+    "objectID": "docs/api/cli.vllm_serve.html#classes",
+    "href": "docs/api/cli.vllm_serve.html#classes",
+    "title": "cli.vllm_serve",
     "section": "",
-    "text": "datasets\nModule containing dataset functionality.\nWe want this to be a wrapper for an existing dataset that we have loaded. Lets use the\nconcept of middlewares to wrap each dataset. We’ll use the collators later on to pad the\ndatasets.\n\n\n\n\n\nName\nDescription\n\n\n\n\nTokenizedPromptDataset\nDataset that returns tokenized prompts from a stream of text files.\n\n\n\n\n\ndatasets.TokenizedPromptDataset(\n    prompt_tokenizer,\n    dataset,\n    process_count=None,\n    keep_in_memory=False,\n    **kwargs,\n)\nDataset that returns tokenized prompts from a stream of text files.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nprompt_tokenizer\nPromptTokenizingStrategy\nThe prompt tokenizing method for processing the data.\nrequired\n\n\ndataset\nDataset\nDataset with text files.\nrequired\n\n\nprocess_count\nint | None\nNumber of processes to use for tokenizing.\nNone\n\n\nkeep_in_memory\nbool | None\nWhether to keep the tokenized dataset in memory.\nFalse"
+    "text": "Name\nDescription\n\n\n\n\nAxolotlScriptArguments\nAdditional arguments for the VLLM server\n\n\n\n\n\ncli.vllm_serve.AxolotlScriptArguments(\n    reasoning_parser='',\n    enable_reasoning=None,\n)\nAdditional arguments for the VLLM server"
   },
   {
-    "objectID": "docs/api/datasets.html#classes",
-    "href": "docs/api/datasets.html#classes",
-    "title": "datasets",
+    "objectID": "docs/api/cli.vllm_serve.html#functions",
+    "href": "docs/api/cli.vllm_serve.html#functions",
+    "title": "cli.vllm_serve",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nTokenizedPromptDataset\nDataset that returns tokenized prompts from a stream of text files.\n\n\n\n\n\ndatasets.TokenizedPromptDataset(\n    prompt_tokenizer,\n    dataset,\n    process_count=None,\n    keep_in_memory=False,\n    **kwargs,\n)\nDataset that returns tokenized prompts from a stream of text files.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nprompt_tokenizer\nPromptTokenizingStrategy\nThe prompt tokenizing method for processing the data.\nrequired\n\n\ndataset\nDataset\nDataset with text files.\nrequired\n\n\nprocess_count\nint | None\nNumber of processes to use for tokenizing.\nNone\n\n\nkeep_in_memory\nbool | None\nWhether to keep the tokenized dataset in memory.\nFalse"
+    "text": "Name\nDescription\n\n\n\n\ndo_vllm_serve\nStarts the VLLM server for serving LLM models used for online RL\n\n\n\n\n\ncli.vllm_serve.do_vllm_serve(config, cli_args)\nStarts the VLLM server for serving LLM models used for online RL\nArgs\n:param cfg: Parsed doct of the YAML config\n:param cli_args: dict of additional command-line arguments of type VllmServeCliArgs\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nprocess_id\n\nthe process id of the started VLLM server"
   },
   {
-    "objectID": "docs/api/common.const.html",
-    "href": "docs/api/common.const.html",
-    "title": "common.const",
+    "objectID": "docs/api/utils.ctx_managers.sequence_parallel.html",
+    "href": "docs/api/utils.ctx_managers.sequence_parallel.html",
+    "title": "utils.ctx_managers.sequence_parallel",
     "section": "",
-    "text": "common.const\ncommon.const\nVarious shared constants"
+    "text": "utils.ctx_managers.sequence_parallel\nModule for Axolotl trainer sequence parallelism manager and utilities\n\n\n\n\n\nName\nDescription\n\n\n\n\nAllGatherWithGrad\nCustom autograd function for all-gather to preserve gradients.\n\n\nSequenceParallelContextManager\nContext manager for sequence parallelism operations.\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.AllGatherWithGrad()\nCustom autograd function for all-gather to preserve gradients.\n\n\n\n\n\nName\nDescription\n\n\n\n\nbackward\nBackward pass for all-gather operation.\n\n\nforward\nForward pass of all-gather of data with sequence dimension.\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.AllGatherWithGrad.backward(\n    ctx,\n    grad_output,\n)\nBackward pass for all-gather operation.\nExtracts the gradient slice corresponding to this rank’s original input\nfrom the full gradient tensor.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\ntorch.autograd function context.\nrequired\n\n\ngrad_output\ntorch.Tensor\nGradient from subsequent layers with respect to the concatenated output tensor.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, None]\nTuple containing the gradient slice for this rank’s input tensor and None for the process group parameter which doesn’t require gradients.\n\n\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.AllGatherWithGrad.forward(\n    ctx,\n    input_tensor,\n    group,\n)\nForward pass of all-gather of data with sequence dimension.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\ntorch.autograd function context.\nrequired\n\n\ninput_tensor\ntorch.Tensor\nTensor from model output with sequence dimension.\nrequired\n\n\ngroup\ndist.ProcessGroup\ntorch.distributed process group.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nTensor from gathering the input_tensor 
from across the process group and concatenating along the sequence dimension.\n\n\n\n\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.SequenceParallelContextManager(\n    models,\n    context_parallel_size,\n    gradient_accumulation_steps,\n    ring_attn_func,\n    heads_k_stride,\n    gather_outputs,\n    device_mesh=None,\n)\nContext manager for sequence parallelism operations.\nThis class provides a context that will automatically apply sequence parallelism\nduring model forward passes using a pre-forward hook, and gather outputs from\nacross the sequence parallelism group using a post-forward hook.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodels\nlist[nn.Module]\nList of models to apply sequence parallelism to pre- and post- forward hooks.\nrequired\n\n\ncontext_parallel_size\nint\nNumber of processes to split sequences over.\nrequired\n\n\ngradient_accumulation_steps\nint\nNumber of steps to accumulate gradients over.\nrequired\n\n\nring_attn_func\nRingAttnFunc\nWhich ring attention function to use. Currently unused.\nrequired\n\n\nheads_k_stride\nint | None\nSequence parallelism K head stride size. 
Passed through to varlen_llama3 ring_flash_attn implementation.\nrequired\n\n\ngather_outputs\nbool\nWhether to gather outputs after model forward pass across the sequence parallel group.\nrequired\n\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\napply_sequence_parallelism\nApply sequence parallelism slicing to a batch.\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.apply_sequence_parallelism(\n    batch,\n    local_rank,\n    local_world_size,\n    gradient_accumulation_steps,\n    ring_attn_func,\n)\nApply sequence parallelism slicing to a batch.\nSpecial handling is implemented for integer logits_to_keep, which indicates\nto only keep the last N tokens in the sequence during generation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbatch\ndict[str, torch.Tensor]\nBatch dictionary (e.g., input_ids, attention_mask, etc.).\nrequired\n\n\nlocal_rank\nint\nLocal rank in the sequence parallel group.\nrequired\n\n\nlocal_world_size\nint\nWorld size of the sequence parallel group.\nrequired\n\n\ngradient_accumulation_steps\nint\nNumber of steps to accumulate gradients over.\nrequired\n\n\nring_attn_func\nRingAttnFunc\nWhich ring attention function to use. Currently unused, but related to above TODO.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[dict[str, torch.Tensor], int, int]\ntuple of: - Batch dictionary with sliced tensors. - The original sequence length before padding. - The number of padding tokens added."
   },
   {
-    "objectID": "docs/api/core.trainers.grpo.trainer.html",
-    "href": "docs/api/core.trainers.grpo.trainer.html",
-    "title": "core.trainers.grpo.trainer",
+    "objectID": "docs/api/utils.ctx_managers.sequence_parallel.html#classes",
+    "href": "docs/api/utils.ctx_managers.sequence_parallel.html#classes",
+    "title": "utils.ctx_managers.sequence_parallel",
     "section": "",
-    "text": "core.trainers.grpo.trainer\nAxolotl GRPO trainers (with and without sequence parallelism handling)\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlAsyncGRPOTrainer\nExtend AsyncGRPOTrainer with axolotl helpers\n\n\nAxolotlGRPOSequenceParallelTrainer\nExtend the base GRPOTrainer for sequence parallelism handling\n\n\nAxolotlGRPOTrainer\nExtend the base GRPOTrainer for axolotl helpers\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlAsyncGRPOTrainer(*args, **kwargs)\nExtend AsyncGRPOTrainer with axolotl helpers\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer(\n    model,\n    reward_funcs,\n    args=None,\n    train_dataset=None,\n    eval_dataset=None,\n    processing_class=None,\n    reward_processing_classes=None,\n    callbacks=None,\n    optimizers=(None, None),\n    peft_config=None,\n    optimizer_cls_and_kwargs=None,\n)\nExtend the base GRPOTrainer for sequence parallelism handling\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_train_dataloader\nGet dataloader for training\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer.get_train_dataloader(\n)\nGet dataloader for training\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOTrainer(*args, **kwargs)\nExtend the base GRPOTrainer for axolotl helpers"
+    "text": "Name\nDescription\n\n\n\n\nAllGatherWithGrad\nCustom autograd function for all-gather to preserve gradients.\n\n\nSequenceParallelContextManager\nContext manager for sequence parallelism operations.\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.AllGatherWithGrad()\nCustom autograd function for all-gather to preserve gradients.\n\n\n\n\n\nName\nDescription\n\n\n\n\nbackward\nBackward pass for all-gather operation.\n\n\nforward\nForward pass of all-gather of data with sequence dimension.\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.AllGatherWithGrad.backward(\n    ctx,\n    grad_output,\n)\nBackward pass for all-gather operation.\nExtracts the gradient slice corresponding to this rank’s original input\nfrom the full gradient tensor.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\ntorch.autograd function context.\nrequired\n\n\ngrad_output\ntorch.Tensor\nGradient from subsequent layers with respect to the concatenated output tensor.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, None]\nTuple containing the gradient slice for this rank’s input tensor and None for the process group parameter which doesn’t require gradients.\n\n\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.AllGatherWithGrad.forward(\n    ctx,\n    input_tensor,\n    group,\n)\nForward pass of all-gather of data with sequence dimension.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nctx\ntorch.autograd.function.FunctionCtx\ntorch.autograd function context.\nrequired\n\n\ninput_tensor\ntorch.Tensor\nTensor from model output with sequence dimension.\nrequired\n\n\ngroup\ndist.ProcessGroup\ntorch.distributed process group.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nTensor from gathering the input_tensor from across the process group and concatenating along the sequence 
dimension.\n\n\n\n\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.SequenceParallelContextManager(\n    models,\n    context_parallel_size,\n    gradient_accumulation_steps,\n    ring_attn_func,\n    heads_k_stride,\n    gather_outputs,\n    device_mesh=None,\n)\nContext manager for sequence parallelism operations.\nThis class provides a context that will automatically apply sequence parallelism\nduring model forward passes using a pre-forward hook, and gather outputs from\nacross the sequence parallelism group using a post-forward hook.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodels\nlist[nn.Module]\nList of models to apply sequence parallelism to pre- and post- forward hooks.\nrequired\n\n\ncontext_parallel_size\nint\nNumber of processes to split sequences over.\nrequired\n\n\ngradient_accumulation_steps\nint\nNumber of steps to accumulate gradients over.\nrequired\n\n\nring_attn_func\nRingAttnFunc\nWhich ring attention function to use. Currently unused.\nrequired\n\n\nheads_k_stride\nint | None\nSequence parallelism K head stride size. Passed through to varlen_llama3 ring_flash_attn implementation.\nrequired\n\n\ngather_outputs\nbool\nWhether to gather outputs after model forward pass across the sequence parallel group.\nrequired"
   },
   {
-    "objectID": "docs/api/core.trainers.grpo.trainer.html#classes",
-    "href": "docs/api/core.trainers.grpo.trainer.html#classes",
-    "title": "core.trainers.grpo.trainer",
+    "objectID": "docs/api/utils.ctx_managers.sequence_parallel.html#functions",
+    "href": "docs/api/utils.ctx_managers.sequence_parallel.html#functions",
+    "title": "utils.ctx_managers.sequence_parallel",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nAxolotlAsyncGRPOTrainer\nExtend AsyncGRPOTrainer with axolotl helpers\n\n\nAxolotlGRPOSequenceParallelTrainer\nExtend the base GRPOTrainer for sequence parallelism handling\n\n\nAxolotlGRPOTrainer\nExtend the base GRPOTrainer for axolotl helpers\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlAsyncGRPOTrainer(*args, **kwargs)\nExtend AsyncGRPOTrainer with axolotl helpers\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer(\n    model,\n    reward_funcs,\n    args=None,\n    train_dataset=None,\n    eval_dataset=None,\n    processing_class=None,\n    reward_processing_classes=None,\n    callbacks=None,\n    optimizers=(None, None),\n    peft_config=None,\n    optimizer_cls_and_kwargs=None,\n)\nExtend the base GRPOTrainer for sequence parallelism handling\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_train_dataloader\nGet dataloader for training\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer.get_train_dataloader(\n)\nGet dataloader for training\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOTrainer(*args, **kwargs)\nExtend the base GRPOTrainer for axolotl helpers"
+    "text": "Name\nDescription\n\n\n\n\napply_sequence_parallelism\nApply sequence parallelism slicing to a batch.\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.apply_sequence_parallelism(\n    batch,\n    local_rank,\n    local_world_size,\n    gradient_accumulation_steps,\n    ring_attn_func,\n)\nApply sequence parallelism slicing to a batch.\nSpecial handling is implemented for integer logits_to_keep, which indicates\nto only keep the last N tokens in the sequence during generation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbatch\ndict[str, torch.Tensor]\nBatch dictionary (e.g., input_ids, attention_mask, etc.).\nrequired\n\n\nlocal_rank\nint\nLocal rank in the sequence parallel group.\nrequired\n\n\nlocal_world_size\nint\nWorld size of the sequence parallel group.\nrequired\n\n\ngradient_accumulation_steps\nint\nNumber of steps to accumulate gradients over.\nrequired\n\n\nring_attn_func\nRingAttnFunc\nWhich ring attention function to use. Currently unused, but related to above TODO.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[dict[str, torch.Tensor], int, int]\ntuple of: - Batch dictionary with sliced tensors. - The original sequence length before padding. - The number of padding tokens added."
   },
   {
-    "objectID": "docs/api/kernels.swiglu.html",
-    "href": "docs/api/kernels.swiglu.html",
-    "title": "kernels.swiglu",
+    "objectID": "docs/api/utils.schemas.peft.html",
+    "href": "docs/api/utils.schemas.peft.html",
+    "title": "utils.schemas.peft",
     "section": "",
-    "text": "kernels.swiglu\nModule for definition of SwiGLU Triton kernels.\nSee “GLU Variants Improve Transformer” (https://arxiv.org/abs/2002.05202).\nCredit to unsloth (https://unsloth.ai/) for inspiration for this implementation.\n\n\n\n\n\nName\nDescription\n\n\n\n\nswiglu_backward\nSwiGLU backward pass using in-place operations.\n\n\nswiglu_forward\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\n\n\n\n\n\nkernels.swiglu.swiglu_backward(grad_output, gate, up)\nSwiGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - Forward pass output (h) - Gradient with respect to gate (df) - Gradient with respect to up-projection (de)\n\n\n\n\n\n\n\nkernels.swiglu.swiglu_forward(gate, up)\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\nx is the gate tensor.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput tensor of shape [batch, seq_len, hidden_dim]."
+    "text": "utils.schemas.peft\nPydantic models for PEFT-related configuration\n\n\n\n\n\nName\nDescription\n\n\n\n\nLoftQConfig\nLoftQ configuration subset\n\n\nLoraConfig\nPeft / LoRA configuration subset\n\n\nPeftConfig\npeftq configuration subset\n\n\nReLoRAConfig\nReLoRA configuration subset\n\n\n\n\n\nutils.schemas.peft.LoftQConfig()\nLoftQ configuration subset\n\n\n\nutils.schemas.peft.LoraConfig()\nPeft / LoRA configuration subset\n\n\n\nutils.schemas.peft.PeftConfig()\npeftq configuration subset\n\n\n\nutils.schemas.peft.ReLoRAConfig()\nReLoRA configuration subset"
   },
   {
-    "objectID": "docs/api/kernels.swiglu.html#functions",
-    "href": "docs/api/kernels.swiglu.html#functions",
-    "title": "kernels.swiglu",
+    "objectID": "docs/api/utils.schemas.peft.html#classes",
+    "href": "docs/api/utils.schemas.peft.html#classes",
+    "title": "utils.schemas.peft",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nswiglu_backward\nSwiGLU backward pass using in-place operations.\n\n\nswiglu_forward\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\n\n\n\n\n\nkernels.swiglu.swiglu_backward(grad_output, gate, up)\nSwiGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - Forward pass output (h) - Gradient with respect to gate (df) - Gradient with respect to up-projection (de)\n\n\n\n\n\n\n\nkernels.swiglu.swiglu_forward(gate, up)\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\nx is the gate tensor.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput tensor of shape [batch, seq_len, hidden_dim]."
+    "text": "Name\nDescription\n\n\n\n\nLoftQConfig\nLoftQ configuration subset\n\n\nLoraConfig\nPeft / LoRA configuration subset\n\n\nPeftConfig\npeftq configuration subset\n\n\nReLoRAConfig\nReLoRA configuration subset\n\n\n\n\n\nutils.schemas.peft.LoftQConfig()\nLoftQ configuration subset\n\n\n\nutils.schemas.peft.LoraConfig()\nPeft / LoRA configuration subset\n\n\n\nutils.schemas.peft.PeftConfig()\npeftq configuration subset\n\n\n\nutils.schemas.peft.ReLoRAConfig()\nReLoRA configuration subset"
   },
   {
-    "objectID": "docs/api/cli.utils.load.html",
-    "href": "docs/api/cli.utils.load.html",
-    "title": "cli.utils.load",
+    "objectID": "docs/api/integrations.kd.trainer.html",
+    "href": "docs/api/integrations.kd.trainer.html",
+    "title": "integrations.kd.trainer",
     "section": "",
-    "text": "cli.utils.load\nUtilities for model, tokenizer, etc. loading.\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_model_and_tokenizer\nHelper function for loading a model, tokenizer, and processor specified in the\n\n\n\n\n\ncli.utils.load.load_model_and_tokenizer(cfg, inference=False)\nHelper function for loading a model, tokenizer, and processor specified in the\ngiven axolotl config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ninference\nbool\nBoolean denoting inference mode.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PreTrainedModel, PreTrainedTokenizer | PreTrainedTokenizerFast | Any, ProcessorMixin | None]\nTuple of (PreTrainedModel, PreTrainedTokenizer, ProcessorMixin)."
+    "text": "integrations.kd.trainer\nKD trainer\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlKDTrainer\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer(*args, **kwargs)\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute_loss\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer.compute_loss(\n    model,\n    inputs,\n    return_outputs=False,\n    num_items_in_batch=None,\n)\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\nSubclass and override for custom behavior."
   },
   {
-    "objectID": "docs/api/cli.utils.load.html#functions",
-    "href": "docs/api/cli.utils.load.html#functions",
-    "title": "cli.utils.load",
+    "objectID": "docs/api/integrations.kd.trainer.html#classes",
+    "href": "docs/api/integrations.kd.trainer.html#classes",
+    "title": "integrations.kd.trainer",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nload_model_and_tokenizer\nHelper function for loading a model, tokenizer, and processor specified in the\n\n\n\n\n\ncli.utils.load.load_model_and_tokenizer(cfg, inference=False)\nHelper function for loading a model, tokenizer, and processor specified in the\ngiven axolotl config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ninference\nbool\nBoolean denoting inference mode.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PreTrainedModel, PreTrainedTokenizer | PreTrainedTokenizerFast | Any, ProcessorMixin | None]\nTuple of (PreTrainedModel, PreTrainedTokenizer, ProcessorMixin)."
+    "text": "Name\nDescription\n\n\n\n\nAxolotlKDTrainer\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer(*args, **kwargs)\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute_loss\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer.compute_loss(\n    model,\n    inputs,\n    return_outputs=False,\n    num_items_in_batch=None,\n)\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\nSubclass and override for custom behavior."
   },
   {
-    "objectID": "docs/api/prompt_strategies.dpo.passthrough.html",
-    "href": "docs/api/prompt_strategies.dpo.passthrough.html",
-    "title": "prompt_strategies.dpo.passthrough",
+    "objectID": "docs/api/utils.model_shard_quant.html",
+    "href": "docs/api/utils.model_shard_quant.html",
+    "title": "utils.model_shard_quant",
     "section": "",
-    "text": "prompt_strategies.dpo.passthrough\nprompt_strategies.dpo.passthrough\nDPO prompt strategies passthrough/zero-processing strategy"
+    "text": "utils.model_shard_quant\nmodule to handle loading model on cpu/meta device for FSDP\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_and_quantize\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\n\n\n\n\n\nutils.model_shard_quant.load_and_quantize(\n    module,\n    name,\n    value,\n    device=None,\n    dtype=None,\n    skip_names=None,\n    to_cpu=False,\n    to_meta=False,\n    verbose=False,\n    quant_method='bnb',\n)\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\nQuantizes Params4bit on device then places on “cpu” if to_cpu=True or “meta” if to_meta=True."
   },
   {
-    "objectID": "docs/api/cli.delinearize_llama4.html",
-    "href": "docs/api/cli.delinearize_llama4.html",
-    "title": "cli.delinearize_llama4",
+    "objectID": "docs/api/utils.model_shard_quant.html#functions",
+    "href": "docs/api/utils.model_shard_quant.html#functions",
+    "title": "utils.model_shard_quant",
     "section": "",
-    "text": "cli.delinearize_llama4\nCLI tool to delinearize quantized/Linearized Llama-4 models.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nConvert a patched HF format Llama4 model (with separated projections)\n\n\n\n\n\ncli.delinearize_llama4.do_cli(model, output)\nConvert a patched HF format Llama4 model (with separated projections)\nback to the original HF format (with fused projections).\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\nUnion[Path, str]\nPath to the patched HF model\nrequired\n\n\noutput\nUnion[Path, str]\nPath to save the converted model\nrequired"
+    "text": "Name\nDescription\n\n\n\n\nload_and_quantize\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\n\n\n\n\n\nutils.model_shard_quant.load_and_quantize(\n    module,\n    name,\n    value,\n    device=None,\n    dtype=None,\n    skip_names=None,\n    to_cpu=False,\n    to_meta=False,\n    verbose=False,\n    quant_method='bnb',\n)\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\nQuantizes Params4bit on device then places on “cpu” if to_cpu=True or “meta” if to_meta=True."
   },
   {
-    "objectID": "docs/api/cli.delinearize_llama4.html#functions",
-    "href": "docs/api/cli.delinearize_llama4.html#functions",
-    "title": "cli.delinearize_llama4",
+    "objectID": "docs/api/utils.collators.batching.html",
+    "href": "docs/api/utils.collators.batching.html",
+    "title": "utils.collators.batching",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ndo_cli\nConvert a patched HF format Llama4 model (with separated projections)\n\n\n\n\n\ncli.delinearize_llama4.do_cli(model, output)\nConvert a patched HF format Llama4 model (with separated projections)\nback to the original HF format (with fused projections).\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\nUnion[Path, str]\nPath to the patched HF model\nrequired\n\n\noutput\nUnion[Path, str]\nPath to save the converted model\nrequired"
+    "text": "utils.collators.batching\nData collators for axolotl to pad labels and position_ids for packed sequences\n\n\n\n\n\nName\nDescription\n\n\n\n\nBatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\nDataCollatorForSeq2Seq\nData collator that will dynamically pad the inputs received, as well as the labels and position_ids\n\n\nPretrainingBatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\nV2BatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\n\n\n\nutils.collators.batching.BatchSamplerDataCollatorForSeq2Seq(\n    tokenizer,\n    model=None,\n    padding=True,\n    max_length=None,\n    pad_to_multiple_of=None,\n    label_pad_token_id=-100,\n    position_pad_token_id=0,\n    return_tensors='pt',\n)\nCollator for multipack specific to the using the BatchSampler\n\n\n\nutils.collators.batching.DataCollatorForSeq2Seq(\n    tokenizer,\n    model=None,\n    padding=True,\n    max_length=None,\n    pad_to_multiple_of=None,\n    label_pad_token_id=-100,\n    position_pad_token_id=0,\n    return_tensors='pt',\n)\nData collator that will dynamically pad the inputs received, as well as the labels and position_ids\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntokenizer\n[PreTrainedTokenizer] or [PreTrainedTokenizerFast]\nThe tokenizer used for encoding the data.\nrequired\n\n\nmodel\n[PreTrainedModel]\nThe model that is being trained. 
If set and has the prepare_decoder_input_ids_from_labels, use it to prepare the decoder_input_ids This is useful when using label_smoothing to avoid calculating loss twice.\nNone\n\n\npadding\nbool, str or [~utils.PaddingStrategy], optional, defaults to True\nSelect a strategy to pad the returned sequences (according to the model’s padding side and padding index) among: - True or 'longest' (default): Pad to the longest sequence in the batch (or no padding if only a single sequence is provided). - 'max_length': Pad to a maximum length specified with the argument max_length or to the maximum acceptable input length for the model if that argument is not provided. - False or 'do_not_pad': No padding (i.e., can output a batch with sequences of different lengths).\nTrue\n\n\nmax_length\nint, optional\nMaximum length of the returned list and optionally padding length (see above).\nNone\n\n\npad_to_multiple_of\nint, optional\nIf set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability &gt;= 7.5 (Volta).\nNone\n\n\nlabel_pad_token_id\nint, optional, defaults to -100\nThe id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).\n-100\n\n\nreturn_tensors\nstr\nThe type of Tensor to return. Allowable values are “np”, “pt” and “tf”.\n'pt'\n\n\n\n\n\n\n\nutils.collators.batching.PretrainingBatchSamplerDataCollatorForSeq2Seq(\n    *args,\n    multipack_attn=True,\n    **kwargs,\n)\nCollator for multipack specific to the using the BatchSampler\n\n\n\nutils.collators.batching.V2BatchSamplerDataCollatorForSeq2Seq(\n    tokenizer,\n    model=None,\n    padding=True,\n    max_length=None,\n    pad_to_multiple_of=None,\n    label_pad_token_id=-100,\n    position_pad_token_id=0,\n    return_tensors='pt',\n    squash_position_ids=False,\n)\nCollator for multipack specific to the using the BatchSampler"
   },
   {
-    "objectID": "docs/api/prompt_strategies.stepwise_supervised.html",
-    "href": "docs/api/prompt_strategies.stepwise_supervised.html",
-    "title": "prompt_strategies.stepwise_supervised",
+    "objectID": "docs/api/utils.collators.batching.html#classes",
+    "href": "docs/api/utils.collators.batching.html#classes",
+    "title": "utils.collators.batching",
     "section": "",
-    "text": "prompt_strategies.stepwise_supervised\nModule for stepwise datasets, typically including a prompt and reasoning traces,\nand (optionally) per-step, or per-prompt-trace labels for reward modelling.\n\n\n\n\n\nName\nDescription\n\n\n\n\nStepwiseSupervisedPromptTokenizingStrategy\nTokenizing strategy for supervised stepwise datasets, typically used for COT-reasoning.\n\n\n\n\n\nprompt_strategies.stepwise_supervised.StepwiseSupervisedPromptTokenizingStrategy(\n    tokenizer,\n    sequence_len=2048,\n    step_separator='\\n',\n    max_completion_length=None,\n    train_on_last_step_only=False,\n)\nTokenizing strategy for supervised stepwise datasets, typically used for COT-reasoning.\nThese datasets should include the following columns:\n- prompt: the prompt text\n- completions: a list of n completion steps\n- labels: a list of n labels indicating the “correctness” of each step"
+    "text": "Name\nDescription\n\n\n\n\nBatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\nDataCollatorForSeq2Seq\nData collator that will dynamically pad the inputs received, as well as the labels and position_ids\n\n\nPretrainingBatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\nV2BatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\n\n\n\nutils.collators.batching.BatchSamplerDataCollatorForSeq2Seq(\n    tokenizer,\n    model=None,\n    padding=True,\n    max_length=None,\n    pad_to_multiple_of=None,\n    label_pad_token_id=-100,\n    position_pad_token_id=0,\n    return_tensors='pt',\n)\nCollator for multipack specific to the using the BatchSampler\n\n\n\nutils.collators.batching.DataCollatorForSeq2Seq(\n    tokenizer,\n    model=None,\n    padding=True,\n    max_length=None,\n    pad_to_multiple_of=None,\n    label_pad_token_id=-100,\n    position_pad_token_id=0,\n    return_tensors='pt',\n)\nData collator that will dynamically pad the inputs received, as well as the labels and position_ids\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntokenizer\n[PreTrainedTokenizer] or [PreTrainedTokenizerFast]\nThe tokenizer used for encoding the data.\nrequired\n\n\nmodel\n[PreTrainedModel]\nThe model that is being trained. If set and has the prepare_decoder_input_ids_from_labels, use it to prepare the decoder_input_ids This is useful when using label_smoothing to avoid calculating loss twice.\nNone\n\n\npadding\nbool, str or [~utils.PaddingStrategy], optional, defaults to True\nSelect a strategy to pad the returned sequences (according to the model’s padding side and padding index) among: - True or 'longest' (default): Pad to the longest sequence in the batch (or no padding if only a single sequence is provided). 
- 'max_length': Pad to a maximum length specified with the argument max_length or to the maximum acceptable input length for the model if that argument is not provided. - False or 'do_not_pad': No padding (i.e., can output a batch with sequences of different lengths).\nTrue\n\n\nmax_length\nint, optional\nMaximum length of the returned list and optionally padding length (see above).\nNone\n\n\npad_to_multiple_of\nint, optional\nIf set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability &gt;= 7.5 (Volta).\nNone\n\n\nlabel_pad_token_id\nint, optional, defaults to -100\nThe id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).\n-100\n\n\nreturn_tensors\nstr\nThe type of Tensor to return. Allowable values are “np”, “pt” and “tf”.\n'pt'\n\n\n\n\n\n\n\nutils.collators.batching.PretrainingBatchSamplerDataCollatorForSeq2Seq(\n    *args,\n    multipack_attn=True,\n    **kwargs,\n)\nCollator for multipack specific to the using the BatchSampler\n\n\n\nutils.collators.batching.V2BatchSamplerDataCollatorForSeq2Seq(\n    tokenizer,\n    model=None,\n    padding=True,\n    max_length=None,\n    pad_to_multiple_of=None,\n    label_pad_token_id=-100,\n    position_pad_token_id=0,\n    return_tensors='pt',\n    squash_position_ids=False,\n)\nCollator for multipack specific to the using the BatchSampler"
   },
   {
-    "objectID": "docs/api/prompt_strategies.stepwise_supervised.html#classes",
-    "href": "docs/api/prompt_strategies.stepwise_supervised.html#classes",
-    "title": "prompt_strategies.stepwise_supervised",
+    "objectID": "docs/api/utils.schemas.trl.html",
+    "href": "docs/api/utils.schemas.trl.html",
+    "title": "utils.schemas.trl",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nStepwiseSupervisedPromptTokenizingStrategy\nTokenizing strategy for supervised stepwise datasets, typically used for COT-reasoning.\n\n\n\n\n\nprompt_strategies.stepwise_supervised.StepwiseSupervisedPromptTokenizingStrategy(\n    tokenizer,\n    sequence_len=2048,\n    step_separator='\\n',\n    max_completion_length=None,\n    train_on_last_step_only=False,\n)\nTokenizing strategy for supervised stepwise datasets, typically used for COT-reasoning.\nThese datasets should include the following columns:\n- prompt: the prompt text\n- completions: a list of n completion steps\n- labels: a list of n labels indicating the “correctness” of each step"
+    "text": "utils.schemas.trl\nPydantic models for TRL trainer configuration\n\n\n\n\n\nName\nDescription\n\n\n\n\nTRLConfig\nInput args for TRL.\n\n\n\n\n\nutils.schemas.trl.TRLConfig()\nInput args for TRL."
   },
   {
-    "objectID": "docs/api/utils.freeze.html",
-    "href": "docs/api/utils.freeze.html",
-    "title": "utils.freeze",
+    "objectID": "docs/api/utils.schemas.trl.html#classes",
+    "href": "docs/api/utils.schemas.trl.html#classes",
+    "title": "utils.schemas.trl",
     "section": "",
-    "text": "utils.freeze\nmodule to freeze/unfreeze parameters by name\n\n\n\n\n\nName\nDescription\n\n\n\n\nLayerNamePattern\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nutils.freeze.LayerNamePattern(pattern)\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nChecks if the given layer name matches the regex pattern.\n\n\n\n\n\nutils.freeze.LayerNamePattern.match(name)\nChecks if the given layer name matches the regex pattern.\nParameters:\n- name (str): The layer name to check.\nReturns:\n- bool: True if the layer name matches the pattern, False otherwise.\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nfreeze_layers_except\nFreezes all layers of the given model except for the layers that match given regex patterns.\n\n\nfreeze_mm_modules\nFreeze all vision/audio/multimodal-projector parameters.\n\n\n\n\n\nutils.freeze.freeze_layers_except(model, regex_patterns)\nFreezes all layers of the given model except for the layers that match given regex patterns.\nPeriods in the patterns are treated as literal periods, not as wildcard characters.\nParameters:\n- model (nn.Module): The PyTorch model to be modified.\n- regex_patterns (list of str): List of regex patterns to match layer names to keep unfrozen.\nNote that you cannot use a dot as a wildcard character in the patterns since it is reserved for separating layer names.\nAlso, to match the entire layer name, the pattern should start with “^” and end with “\\(\", otherwise it will match any part of the layer name.\n  The range pattern part is optional and it is not compiled as a regex pattern which means you must put \"\\)” before the range pattern if you want to match the entire layer name.\nE.g., [“^model.embed_tokens.weight\\([:32000]\", \"layers.2[0-9]+.block_sparse_moe.gate.[a-z]+\\)”]\nReturns:\nNone; the model is modified in 
place.\n\n\n\nutils.freeze.freeze_mm_modules(model)\nFreeze all vision/audio/multimodal-projector parameters.\nIterates over model.named_parameters() and sets requires_grad = False\nfor any parameter whose name contains a known vision/audio module prefix.\nThis is useful when fine-tuning only the language backbone of a multimodal\nmodel and avoids the need for ddp_find_unused_parameters=True."
+    "text": "Name\nDescription\n\n\n\n\nTRLConfig\nInput args for TRL.\n\n\n\n\n\nutils.schemas.trl.TRLConfig()\nInput args for TRL."
   },
   {
-    "objectID": "docs/api/utils.freeze.html#classes",
-    "href": "docs/api/utils.freeze.html#classes",
-    "title": "utils.freeze",
+    "objectID": "docs/api/core.builders.rl.html",
+    "href": "docs/api/core.builders.rl.html",
+    "title": "core.builders.rl",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nLayerNamePattern\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nutils.freeze.LayerNamePattern(pattern)\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nChecks if the given layer name matches the regex pattern.\n\n\n\n\n\nutils.freeze.LayerNamePattern.match(name)\nChecks if the given layer name matches the regex pattern.\nParameters:\n- name (str): The layer name to check.\nReturns:\n- bool: True if the layer name matches the pattern, False otherwise."
+    "text": "core.builders.rl\nBuilder for RLHF trainers\n\n\n\n\n\nName\nDescription\n\n\n\n\nHFRLTrainerBuilder\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)\n\n\n\n\n\ncore.builders.rl.HFRLTrainerBuilder(cfg, model, tokenizer, processor=None)\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)"
   },
   {
-    "objectID": "docs/api/utils.freeze.html#functions",
-    "href": "docs/api/utils.freeze.html#functions",
-    "title": "utils.freeze",
+    "objectID": "docs/api/core.builders.rl.html#classes",
+    "href": "docs/api/core.builders.rl.html#classes",
+    "title": "core.builders.rl",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nfreeze_layers_except\nFreezes all layers of the given model except for the layers that match given regex patterns.\n\n\nfreeze_mm_modules\nFreeze all vision/audio/multimodal-projector parameters.\n\n\n\n\n\nutils.freeze.freeze_layers_except(model, regex_patterns)\nFreezes all layers of the given model except for the layers that match given regex patterns.\nPeriods in the patterns are treated as literal periods, not as wildcard characters.\nParameters:\n- model (nn.Module): The PyTorch model to be modified.\n- regex_patterns (list of str): List of regex patterns to match layer names to keep unfrozen.\nNote that you cannot use a dot as a wildcard character in the patterns since it is reserved for separating layer names.\nAlso, to match the entire layer name, the pattern should start with “^” and end with “\\(\", otherwise it will match any part of the layer name.\n  The range pattern part is optional and it is not compiled as a regex pattern which means you must put \"\\)” before the range pattern if you want to match the entire layer name.\nE.g., [“^model.embed_tokens.weight\\([:32000]\", \"layers.2[0-9]+.block_sparse_moe.gate.[a-z]+\\)”]\nReturns:\nNone; the model is modified in place.\n\n\n\nutils.freeze.freeze_mm_modules(model)\nFreeze all vision/audio/multimodal-projector parameters.\nIterates over model.named_parameters() and sets requires_grad = False\nfor any parameter whose name contains a known vision/audio module prefix.\nThis is useful when fine-tuning only the language backbone of a multimodal\nmodel and avoids the need for ddp_find_unused_parameters=True."
+    "text": "Name\nDescription\n\n\n\n\nHFRLTrainerBuilder\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)\n\n\n\n\n\ncore.builders.rl.HFRLTrainerBuilder(cfg, model, tokenizer, processor=None)\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)"
   },
   {
-    "objectID": "docs/api/cli.train.html",
-    "href": "docs/api/cli.train.html",
-    "title": "cli.train",
+    "objectID": "docs/api/prompt_strategies.alpaca_chat.html",
+    "href": "docs/api/prompt_strategies.alpaca_chat.html",
+    "title": "prompt_strategies.alpaca_chat",
     "section": "",
-    "text": "cli.train\nCLI to run training on a model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_train.\n\n\ndo_train\nTrains a transformers model by first loading the dataset(s) specified in the\n\n\n\n\n\ncli.train.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_train.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.train.do_train(cfg, cli_args)\nTrains a transformers model by first loading the dataset(s) specified in the\naxolotl config, and then calling axolotl.train.train. Also runs the plugin\nmanager’s post_train_unload once training completes.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nTrainerCliArgs\nTraining-specific CLI arguments.\nrequired"
+    "text": "prompt_strategies.alpaca_chat\nModule for Alpaca prompt strategy classes\n\n\n\n\n\nName\nDescription\n\n\n\n\nAlpacaChatPrompter\nAlpaca Chat Prompter extending the system prompt to for chat-instruct answers\n\n\nAlpacaConcisePrompter\nAlpaca Prompter extending the system prompt to ask for concise chat-instruct answers\n\n\nAlpacaQAPromptTokenizingStrategy\nTokenizing strategy for AlpacaQA\n\n\nCamelAIPromptTokenizingStrategy\nTokenizing strategy for CamelAI datasets\n\n\nNoSystemPrompter\nNull Prompter with no system prompts\n\n\n\n\n\nprompt_strategies.alpaca_chat.AlpacaChatPrompter()\nAlpaca Chat Prompter extending the system prompt to for chat-instruct answers\n\n\n\nprompt_strategies.alpaca_chat.AlpacaConcisePrompter(\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Prompter extending the system prompt to ask for concise chat-instruct answers\n\n\n\nprompt_strategies.alpaca_chat.AlpacaQAPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for AlpacaQA\n\n\n\nprompt_strategies.alpaca_chat.CamelAIPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for CamelAI datasets\n\n\n\nprompt_strategies.alpaca_chat.NoSystemPrompter()\nNull Prompter with no system prompts"
   },
   {
-    "objectID": "docs/api/cli.train.html#functions",
-    "href": "docs/api/cli.train.html#functions",
-    "title": "cli.train",
+    "objectID": "docs/api/prompt_strategies.alpaca_chat.html#classes",
+    "href": "docs/api/prompt_strategies.alpaca_chat.html#classes",
+    "title": "prompt_strategies.alpaca_chat",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_train.\n\n\ndo_train\nTrains a transformers model by first loading the dataset(s) specified in the\n\n\n\n\n\ncli.train.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_train.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.train.do_train(cfg, cli_args)\nTrains a transformers model by first loading the dataset(s) specified in the\naxolotl config, and then calling axolotl.train.train. Also runs the plugin\nmanager’s post_train_unload once training completes.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nTrainerCliArgs\nTraining-specific CLI arguments.\nrequired"
+    "text": "Name\nDescription\n\n\n\n\nAlpacaChatPrompter\nAlpaca Chat Prompter extending the system prompt to for chat-instruct answers\n\n\nAlpacaConcisePrompter\nAlpaca Prompter extending the system prompt to ask for concise chat-instruct answers\n\n\nAlpacaQAPromptTokenizingStrategy\nTokenizing strategy for AlpacaQA\n\n\nCamelAIPromptTokenizingStrategy\nTokenizing strategy for CamelAI datasets\n\n\nNoSystemPrompter\nNull Prompter with no system prompts\n\n\n\n\n\nprompt_strategies.alpaca_chat.AlpacaChatPrompter()\nAlpaca Chat Prompter extending the system prompt to for chat-instruct answers\n\n\n\nprompt_strategies.alpaca_chat.AlpacaConcisePrompter(\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Prompter extending the system prompt to ask for concise chat-instruct answers\n\n\n\nprompt_strategies.alpaca_chat.AlpacaQAPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for AlpacaQA\n\n\n\nprompt_strategies.alpaca_chat.CamelAIPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for CamelAI datasets\n\n\n\nprompt_strategies.alpaca_chat.NoSystemPrompter()\nNull Prompter with no system prompts"
   },
   {
-    "objectID": "docs/api/models.mamba.modeling_mamba.html",
-    "href": "docs/api/models.mamba.modeling_mamba.html",
-    "title": "models.mamba.modeling_mamba",
+    "objectID": "docs/api/core.trainers.trl.html",
+    "href": "docs/api/core.trainers.trl.html",
+    "title": "core.trainers.trl",
     "section": "",
-    "text": "models.mamba.modeling_mamba\nmodels.mamba.modeling_mamba"
+    "text": "core.trainers.trl\nModule for TRL RL trainers\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlCPOTrainer\nExtend the base CPOTrainer for axolotl helpers\n\n\nAxolotlKTOTrainer\nExtend the base KTOTrainer for axolotl helpers\n\n\nAxolotlORPOTrainer\nExtend the base ORPOTrainer for axolotl helpers\n\n\nAxolotlPRMTrainer\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\nAxolotlRewardTrainer\nExtend the base RewardTrainer for axolotl helpers\n\n\n\n\n\ncore.trainers.trl.AxolotlCPOTrainer(*args, **kwargs)\nExtend the base CPOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlKTOTrainer(*args, **kwargs)\nExtend the base KTOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlORPOTrainer(*args, **kwargs)\nExtend the base ORPOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlPRMTrainer(*args, **kwargs)\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlRewardTrainer(*args, **kwargs)\nExtend the base RewardTrainer for axolotl helpers"
   },
   {
-    "objectID": "docs/api/prompt_strategies.alpaca_w_system.html",
-    "href": "docs/api/prompt_strategies.alpaca_w_system.html",
-    "title": "prompt_strategies.alpaca_w_system",
+    "objectID": "docs/api/core.trainers.trl.html#classes",
+    "href": "docs/api/core.trainers.trl.html#classes",
+    "title": "core.trainers.trl",
     "section": "",
-    "text": "prompt_strategies.alpaca_w_system\nPrompt strategies loader for alpaca instruction datasets with system prompts\n\n\n\n\n\nName\nDescription\n\n\n\n\nInstructionWSystemPromptTokenizingStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nOpenOrcaPromptTokenizingStrategy\nTokenizing strategy for OpenOrca datasets\n\n\nOpenOrcaSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\nSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset\n\n\n\n\n\nprompt_strategies.alpaca_w_system.InstructionWSystemPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for OpenOrca datasets\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaSystemDataPrompter(\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\n\nprompt_strategies.alpaca_w_system.SystemDataPrompter(\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset"
+    "text": "Name\nDescription\n\n\n\n\nAxolotlCPOTrainer\nExtend the base CPOTrainer for axolotl helpers\n\n\nAxolotlKTOTrainer\nExtend the base KTOTrainer for axolotl helpers\n\n\nAxolotlORPOTrainer\nExtend the base ORPOTrainer for axolotl helpers\n\n\nAxolotlPRMTrainer\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\nAxolotlRewardTrainer\nExtend the base RewardTrainer for axolotl helpers\n\n\n\n\n\ncore.trainers.trl.AxolotlCPOTrainer(*args, **kwargs)\nExtend the base CPOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlKTOTrainer(*args, **kwargs)\nExtend the base KTOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlORPOTrainer(*args, **kwargs)\nExtend the base ORPOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlPRMTrainer(*args, **kwargs)\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlRewardTrainer(*args, **kwargs)\nExtend the base RewardTrainer for axolotl helpers"
   },
   {
-    "objectID": "docs/api/prompt_strategies.alpaca_w_system.html#classes",
-    "href": "docs/api/prompt_strategies.alpaca_w_system.html#classes",
-    "title": "prompt_strategies.alpaca_w_system",
+    "objectID": "docs/api/integrations.grokfast.optimizer.html",
+    "href": "docs/api/integrations.grokfast.optimizer.html",
+    "title": "integrations.grokfast.optimizer",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nInstructionWSystemPromptTokenizingStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nOpenOrcaPromptTokenizingStrategy\nTokenizing strategy for OpenOrca datasets\n\n\nOpenOrcaSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\nSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset\n\n\n\n\n\nprompt_strategies.alpaca_w_system.InstructionWSystemPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for OpenOrca datasets\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaSystemDataPrompter(\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\n\nprompt_strategies.alpaca_w_system.SystemDataPrompter(\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset"
+    "text": "integrations.grokfast.optimizer\nintegrations.grokfast.optimizer"
   },
   {
-    "objectID": "docs/api/prompt_strategies.kto.user_defined.html",
-    "href": "docs/api/prompt_strategies.kto.user_defined.html",
-    "title": "prompt_strategies.kto.user_defined",
+    "objectID": "docs/api/monkeypatch.multipack.html",
+    "href": "docs/api/monkeypatch.multipack.html",
+    "title": "monkeypatch.multipack",
     "section": "",
-    "text": "prompt_strategies.kto.user_defined\nprompt_strategies.kto.user_defined\nUser-defined KTO strategies"
+    "text": "monkeypatch.multipack\nmonkeypatch.multipack\nmultipack patching for v2 of sample packing"
   },
   {
-    "objectID": "docs/api/integrations.base.html",
-    "href": "docs/api/integrations.base.html",
-    "title": "integrations.base",
+    "objectID": "docs/api/cli.config.html",
+    "href": "docs/api/cli.config.html",
+    "title": "cli.config",
     "section": "",
-    "text": "integrations.base\nBase class for all plugins.\nA plugin is a reusable, modular, and self-contained piece of code that extends the functionality of Axolotl.\nPlugins can be used to integrate third-party models, modify the training process, or add new features.\nTo create a new plugin, you need to inherit from the BasePlugin class and implement the required methods.\n\n\n\n\n\nName\nDescription\n\n\n\n\nBaseOptimizerFactory\nBase class for factories to create custom optimizers\n\n\nBasePlugin\nBase class for all plugins. Defines the interface for plugin methods.\n\n\nPluginManager\nThe PluginManager class is responsible for loading and managing plugins. It\n\n\n\n\n\nintegrations.base.BaseOptimizerFactory()\nBase class for factories to create custom optimizers\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_decay_parameter_names\nGet all parameter names that weight decay will be applied to.\n\n\n\n\n\nintegrations.base.BaseOptimizerFactory.get_decay_parameter_names(model)\nGet all parameter names that weight decay will be applied to.\nThis function filters out parameters in two ways:\n1. By layer type (instances of layers specified in ALL_LAYERNORM_LAYERS)\n2. By parameter name patterns (containing ‘bias’, or variation of ‘norm’)\n\n\n\n\n\nintegrations.base.BasePlugin()\nBase class for all plugins. Defines the interface for plugin methods.\nA plugin is a reusable, modular, and self-contained piece of code that extends\nthe functionality of Axolotl. 
Plugins can be used to integrate third-party models,\nmodify the training process, or add new features.\nTo create a new plugin, you need to inherit from the BasePlugin class and\nimplement the required methods.\n\n\nPlugin methods include:\n- register(cfg): Registers the plugin with the given configuration.\n- load_datasets(cfg): Loads and preprocesses the dataset for training.\n- pre_model_load(cfg): Performs actions before the model is loaded.\n- post_model_build(cfg, model): Performs actions after the model is loaded, but\nbefore LoRA adapters are applied.\n- pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.\n- post_lora_load(cfg, model): Performs actions after LoRA weights are loaded.\n- post_model_load(cfg, model): Performs actions after the model is loaded,\ninclusive of any adapters.\n- post_trainer_create(cfg, trainer): Performs actions after the trainer is\ncreated.\n- create_optimizer(cfg, trainer): Creates and returns an optimizer for training.\n- create_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and\nreturns a learning rate scheduler.\n- add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before\ntraining.\n- add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after\ntraining.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nAdds callbacks to the trainer after creating the trainer. 
This is useful for\n\n\nadd_callbacks_pre_trainer\nSet up callbacks before creating the trainer.\n\n\ncreate_lr_scheduler\nCreates and returns a learning rate scheduler.\n\n\ncreate_optimizer\nCreates and returns an optimizer for training.\n\n\nget_collator_cls_and_kwargs\nReturns a custom class for the collator.\n\n\nget_input_args\nReturns a pydantic model for the plugin’s input arguments.\n\n\nget_trainer_cls\nReturns a custom class for the trainer.\n\n\nget_training_args\nReturns custom training arguments to set on TrainingArgs.\n\n\nget_training_args_mixin\nReturns a dataclass model for the plugin’s training arguments.\n\n\nload_datasets\nLoads and preprocesses the dataset for training.\n\n\non_rollouts_scored\nCalled after rollouts are scored during online RL (GRPO/PPO).\n\n\npost_lora_load\nPerforms actions after LoRA weights are loaded.\n\n\npost_model_build\nPerforms actions after the model is built/loaded, but before any adapters are applied.\n\n\npost_model_load\nPerforms actions after the model is loaded.\n\n\npost_train\nPerforms actions after training is complete.\n\n\npost_train_unload\nPerforms actions after training is complete and the model is unloaded.\n\n\npost_trainer_create\nPerforms actions after the trainer is created.\n\n\npre_lora_load\nPerforms actions before LoRA weights are loaded.\n\n\npre_model_load\nPerforms actions before the model is loaded.\n\n\nregister\nRegisters the plugin with the given configuration as an unparsed dict.\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_post_trainer(cfg, trainer)\nAdds callbacks to the trainer after creating the trainer. 
This is useful for\ncallbacks that require access to the model or trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[Callable]\nA list of callback functions to be added\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_pre_trainer(cfg, model)\nSet up callbacks before creating the trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[Callable]\nA list of callback functions to be added to the TrainingArgs.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.create_lr_scheduler(\n    cfg,\n    trainer,\n    optimizer,\n    num_training_steps,\n)\nCreates and returns a learning rate scheduler.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\noptimizer\nOptimizer\nThe optimizer for training.\nrequired\n\n\nnum_training_steps\nint\nTotal number of training steps\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLRScheduler | None\nThe created learning rate scheduler.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.create_optimizer(cfg, trainer)\nCreates and returns an optimizer for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nOptimizer | None\nThe created optimizer.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_collator_cls_and_kwargs(cfg, is_eval=False)\nReturns a custom class for the 
collator.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe global axolotl configuration.\nrequired\n\n\nis_eval\nbool\nWhether this is an eval split.\nFalse\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nclass\n\nThe class for the collator.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_input_args()\nReturns a pydantic model for the plugin’s input arguments.\n\n\n\nintegrations.base.BasePlugin.get_trainer_cls(cfg)\nReturns a custom class for the trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe global axolotl configuration.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntype[Trainer] | None\nThe first non-None trainer class returned by a plugin.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_training_args(cfg)\nReturns custom training arguments to set on TrainingArgs.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe global axolotl configuration.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nobject\n\ndict containing the training arguments.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_training_args_mixin()\nReturns a dataclass model for the plugin’s training arguments.\n\n\n\nintegrations.base.BasePlugin.load_datasets(cfg, preprocess=False)\nLoads and preprocesses the dataset for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\npreprocess\nbool\nWhether this is the preprocess step of the datasets.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndataset_meta\nUnion['TrainDatasetMeta', None]\nThe metadata for the training dataset.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.on_rollouts_scored(\n    cfg,\n    trainer,\n    prompts,\n    completions,\n    rewards,\n    advantages,\n)\nCalled after rollouts are scored during online RL (GRPO/PPO).\nProvides access to the full scored 
rollout data for logging, trace\nstorage, or analysis. Called once per scoring step with all samples\nfrom that step.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe axolotl configuration.\nrequired\n\n\ntrainer\n\nThe trainer instance.\nrequired\n\n\nprompts\nlist[str]\nList of prompt texts (one per sample).\nrequired\n\n\ncompletions\nlist[str]\nList of completion texts (one per sample).\nrequired\n\n\nrewards\ndict[str, list[float]]\nDict mapping reward function name to list of reward values.\nrequired\n\n\nadvantages\nlist[float]\nList of advantage values (one per sample).\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_lora_load(cfg, model)\nPerforms actions after LoRA weights are loaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_model_build(cfg, model)\nPerforms actions after the model is built/loaded, but before any adapters are applied.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_model_load(cfg, model)\nPerforms actions after the model is loaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_train(cfg, model)\nPerforms actions after training is complete.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe axolotl configuration.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_train_unload(cfg)\nPerforms actions after training is complete and the model is 
unloaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_trainer_create(cfg, trainer)\nPerforms actions after the trainer is created.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.pre_lora_load(cfg, model)\nPerforms actions before LoRA weights are loaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.pre_model_load(cfg)\nPerforms actions before the model is loaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.register(cfg)\nRegisters the plugin with the given configuration as an unparsed dict.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\n\n\nintegrations.base.PluginManager()\nThe PluginManager class is responsible for loading and managing plugins. 
It\nshould be a singleton so it can be accessed from anywhere in the codebase.\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nplugins\nOrderedDict[str, BasePlugin]\nA list of loaded plugins.\n\n\n\n\n\n\nKey methods include:\n- get_instance(): Static method to get the singleton instance of PluginManager.\n- register(plugin_name: str): Registers a new plugin by its name.\n- pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nCalls the add_callbacks_post_trainer method of all registered plugins.\n\n\nadd_callbacks_pre_trainer\nCalls the add_callbacks_pre_trainer method of all registered plugins.\n\n\ncreate_lr_scheduler\nCalls the create_lr_scheduler method of all registered plugins and returns\n\n\ncreate_optimizer\nCalls the create_optimizer method of all registered plugins and returns\n\n\nget_collator_cls_and_kwargs\nCalls the get_collator_cls_and_kwargs method of all registered plugins and returns the first non-None collator class.\n\n\nget_input_args\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\n\n\nget_instance\nReturns the singleton instance of PluginManager. 
If the instance doesn’t\n\n\nget_trainer_cls\nCalls the get_trainer_cls method of all registered plugins and returns the\n\n\nget_training_args\nCalls the get_training_args method of all registered plugins and returns the combined training arguments.\n\n\nget_training_args_mixin\nReturns a list of dataclasses for all registered plugins’ training args mixins’\n\n\nload_datasets\nCalls the load_datasets method of each registered plugin.\n\n\non_rollouts_scored\nCalls the on_rollouts_scored method of all registered plugins.\n\n\npost_lora_load\nCalls the post_lora_load method of all registered plugins.\n\n\npost_model_build\nCalls the post_model_build method of all registered plugins after the\n\n\npost_model_load\nCalls the post_model_load method of all registered plugins after the model\n\n\npost_train\nCalls the post_train method of all registered plugins.\n\n\npost_train_unload\nCalls the post_train_unload method of all registered plugins.\n\n\npost_trainer_create\nCalls the post_trainer_create method of all registered plugins.\n\n\npre_lora_load\nCalls the pre_lora_load method of all registered plugins.\n\n\npre_model_load\nCalls the pre_model_load method of all registered plugins.\n\n\nregister\nRegisters a new plugin by its name.\n\n\n\n\n\nintegrations.base.PluginManager.add_callbacks_post_trainer(cfg, trainer)\nCalls the add_callbacks_post_trainer method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[Callable]\nA list of callback functions to be added to the TrainingArgs.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.add_callbacks_pre_trainer(cfg, model)\nCalls the add_callbacks_pre_trainer method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe 
configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[Callable]\nA list of callback functions to be added to the TrainingArgs.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.create_lr_scheduler(\n    trainer,\n    optimizer,\n    num_training_steps,\n)\nCalls the create_lr_scheduler method of all registered plugins and returns\nthe first non-None scheduler.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\noptimizer\nOptimizer\nThe optimizer for training.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLRScheduler | None\nThe created learning rate scheduler, or None if not found.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.create_optimizer(trainer)\nCalls the create_optimizer method of all registered plugins and returns\nthe first non-None optimizer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nOptimizer | None\nThe created optimizer, or None if none was found.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.get_collator_cls_and_kwargs(cfg, is_eval=False)\nCalls the get_collator_cls_and_kwargs method of all registered plugins and returns the first non-None collator class.\nParameters:\ncfg (dict): The configuration for the plugins.\nis_eval (bool): Whether this is an eval split.\nReturns:\nobject: The collator class, or None if none was found.\n\n\n\nintegrations.base.PluginManager.get_input_args()\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[str]\nA list of Pydantic classes for all registered plugins’ input arguments.’\n\n\n\n\n\n\n\nintegrations.base.PluginManager.get_instance()\nReturns the singleton instance of 
PluginManager. If the instance doesn’t\nexist, it creates a new one.\n\n\n\nintegrations.base.PluginManager.get_trainer_cls(cfg)\nCalls the get_trainer_cls method of all registered plugins and returns the\nfirst non-None trainer class.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTrainer | None\nThe first non-None trainer class returned by a plugin.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.get_training_args(cfg)\nCalls the get_training_args method of all registered plugins and returns the combined training arguments.\nParameters:\ncfg (dict): The configuration for the plugins.\nReturns:\nobject: The training arguments\n\n\n\nintegrations.base.PluginManager.get_training_args_mixin()\nReturns a list of dataclasses for all registered plugins’ training args mixins’\nReturns:\nlist[str]: A list of dataclsses\n\n\n\nintegrations.base.PluginManager.load_datasets(cfg, preprocess=False)\nCalls the load_datasets method of each registered plugin.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\npreprocess\nbool\nWhether this is preprocess step of the datasets.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nUnion['TrainDatasetMeta', None]\nThe dataset metadata loaded from all registered plugins.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.on_rollouts_scored(\n    cfg,\n    trainer,\n    prompts,\n    completions,\n    rewards,\n    advantages,\n)\nCalls the on_rollouts_scored method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\ntrainer\n\nThe trainer instance.\nrequired\n\n\nprompts\nlist[str]\nList of prompt texts.\nrequired\n\n\ncompletions\nlist[str]\nList of completion 
texts.\nrequired\n\n\nrewards\ndict[str, list[float]]\nDict mapping reward function name to list of rewards.\nrequired\n\n\nadvantages\nlist[float]\nList of advantage values.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_lora_load(cfg, model)\nCalls the post_lora_load method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_model_build(cfg, model)\nCalls the post_model_build method of all registered plugins after the\nmodel has been built / loaded, but before any adapters have been applied.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_model_load(cfg, model)\nCalls the post_model_load method of all registered plugins after the model\nhas been loaded inclusive of any adapters.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_train(cfg, model)\nCalls the post_train method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_train_unload(cfg)\nCalls the post_train_unload method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_trainer_create(cfg, trainer)\nCalls the 
post_trainer_create method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.pre_lora_load(cfg, model)\nCalls the pre_lora_load method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.pre_model_load(cfg)\nCalls the pre_model_load method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.register(plugin_name)\nRegisters a new plugin by its name.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nplugin_name\nstr\nThe name of the plugin to be registered.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nImportError\nIf the plugin module cannot be imported.\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_plugin\nLoads a plugin based on the given plugin name.\n\n\n\n\n\nintegrations.base.load_plugin(plugin_name)\nLoads a plugin based on the given plugin name.\nThe plugin name should be in the format “module_name.class_name”. This function\nsplits the plugin name into module and class, imports the module, retrieves the\nclass from the module, and creates an instance of the class.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nplugin_name\nstr\nThe name of the plugin to be loaded. 
The name should be in the format “module_name.class_name”.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nBasePlugin\nAn instance of the loaded plugin.\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nImportError\nIf the plugin module cannot be imported."
+    "text": "cli.config\nConfiguration loading and processing.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncheck_remote_config\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\n\n\nchoose_config\nHelper method for choosing a axolotl config YAML file (considering only files\n\n\nload_cfg\nLoads the axolotl configuration stored at config, validates it, and performs\n\n\nprepare_plugins\nRegisters the plugins for the given configuration.\n\n\n\n\n\ncli.config.check_remote_config(config)\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\nfor it and parse its content, first as JSON, then as YAML (YAML is preferred).\nFinally, the parsed content is written to a local file and its path is returned.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[str, Path]\nHTTPS URL to a YAML or JSON file.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nUnion[str, Path]\nEither the original config if it’s not a valid HTTPS URL, or the path to the\n\n\n\nUnion[str, Path]\ndownloaded remote config.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the remote configuration is neither valid JSON or YAML.\n\n\n\nRuntimeError\nIf some request-related exception occurs from the file download.\n\n\n\nException\nCatch-all for any other exception.\n\n\n\n\n\n\n\ncli.config.choose_config(path)\nHelper method for choosing a axolotl config YAML file (considering only files\nending with .yml or .yaml). 
If more than one config file exists in the passed\npath, the user is prompted to choose one.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nPath\nDirectory in which config file(s) are stored.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to either (1) the sole YAML file, or (2) if more than one YAML files exist,\n\n\n\nstr\nthe user-selected YAML file.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf no YAML files are found in the given path.\n\n\n\n\n\n\n\ncli.config.load_cfg(config=Path('examples/'), **kwargs)\nLoads the axolotl configuration stored at config, validates it, and performs\nvarious setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr | Path | DictDefault\nPath (local or remote) to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nDictDefault\nDictDefault mapping configuration keys to values.\n\n\n\n\n\n\n\ncli.config.prepare_plugins(cfg)\nRegisters the plugins for the given configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired"
   },
   {
-    "objectID": "docs/api/integrations.base.html#classes",
-    "href": "docs/api/integrations.base.html#classes",
-    "title": "integrations.base",
+    "objectID": "docs/api/cli.config.html#functions",
+    "href": "docs/api/cli.config.html#functions",
+    "title": "cli.config",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nBaseOptimizerFactory\nBase class for factories to create custom optimizers\n\n\nBasePlugin\nBase class for all plugins. Defines the interface for plugin methods.\n\n\nPluginManager\nThe PluginManager class is responsible for loading and managing plugins. It\n\n\n\n\n\nintegrations.base.BaseOptimizerFactory()\nBase class for factories to create custom optimizers\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_decay_parameter_names\nGet all parameter names that weight decay will be applied to.\n\n\n\n\n\nintegrations.base.BaseOptimizerFactory.get_decay_parameter_names(model)\nGet all parameter names that weight decay will be applied to.\nThis function filters out parameters in two ways:\n1. By layer type (instances of layers specified in ALL_LAYERNORM_LAYERS)\n2. By parameter name patterns (containing ‘bias’, or variation of ‘norm’)\n\n\n\n\n\nintegrations.base.BasePlugin()\nBase class for all plugins. Defines the interface for plugin methods.\nA plugin is a reusable, modular, and self-contained piece of code that extends\nthe functionality of Axolotl. 
Plugins can be used to integrate third-party models,\nmodify the training process, or add new features.\nTo create a new plugin, you need to inherit from the BasePlugin class and\nimplement the required methods.\n\n\nPlugin methods include:\n- register(cfg): Registers the plugin with the given configuration.\n- load_datasets(cfg): Loads and preprocesses the dataset for training.\n- pre_model_load(cfg): Performs actions before the model is loaded.\n- post_model_build(cfg, model): Performs actions after the model is loaded, but\nbefore LoRA adapters are applied.\n- pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.\n- post_lora_load(cfg, model): Performs actions after LoRA weights are loaded.\n- post_model_load(cfg, model): Performs actions after the model is loaded,\ninclusive of any adapters.\n- post_trainer_create(cfg, trainer): Performs actions after the trainer is\ncreated.\n- create_optimizer(cfg, trainer): Creates and returns an optimizer for training.\n- create_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and\nreturns a learning rate scheduler.\n- add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before\ntraining.\n- add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after\ntraining.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nAdds callbacks to the trainer after creating the trainer. 
This is useful for\n\n\nadd_callbacks_pre_trainer\nSet up callbacks before creating the trainer.\n\n\ncreate_lr_scheduler\nCreates and returns a learning rate scheduler.\n\n\ncreate_optimizer\nCreates and returns an optimizer for training.\n\n\nget_collator_cls_and_kwargs\nReturns a custom class for the collator.\n\n\nget_input_args\nReturns a pydantic model for the plugin’s input arguments.\n\n\nget_trainer_cls\nReturns a custom class for the trainer.\n\n\nget_training_args\nReturns custom training arguments to set on TrainingArgs.\n\n\nget_training_args_mixin\nReturns a dataclass model for the plugin’s training arguments.\n\n\nload_datasets\nLoads and preprocesses the dataset for training.\n\n\non_rollouts_scored\nCalled after rollouts are scored during online RL (GRPO/PPO).\n\n\npost_lora_load\nPerforms actions after LoRA weights are loaded.\n\n\npost_model_build\nPerforms actions after the model is built/loaded, but before any adapters are applied.\n\n\npost_model_load\nPerforms actions after the model is loaded.\n\n\npost_train\nPerforms actions after training is complete.\n\n\npost_train_unload\nPerforms actions after training is complete and the model is unloaded.\n\n\npost_trainer_create\nPerforms actions after the trainer is created.\n\n\npre_lora_load\nPerforms actions before LoRA weights are loaded.\n\n\npre_model_load\nPerforms actions before the model is loaded.\n\n\nregister\nRegisters the plugin with the given configuration as an unparsed dict.\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_post_trainer(cfg, trainer)\nAdds callbacks to the trainer after creating the trainer. 
This is useful for\ncallbacks that require access to the model or trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[Callable]\nA list of callback functions to be added\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_pre_trainer(cfg, model)\nSet up callbacks before creating the trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[Callable]\nA list of callback functions to be added to the TrainingArgs.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.create_lr_scheduler(\n    cfg,\n    trainer,\n    optimizer,\n    num_training_steps,\n)\nCreates and returns a learning rate scheduler.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\noptimizer\nOptimizer\nThe optimizer for training.\nrequired\n\n\nnum_training_steps\nint\nTotal number of training steps\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLRScheduler | None\nThe created learning rate scheduler.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.create_optimizer(cfg, trainer)\nCreates and returns an optimizer for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nOptimizer | None\nThe created optimizer.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_collator_cls_and_kwargs(cfg, is_eval=False)\nReturns a custom class for the 
collator.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe global axolotl configuration.\nrequired\n\n\nis_eval\nbool\nWhether this is an eval split.\nFalse\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nclass\n\nThe class for the collator.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_input_args()\nReturns a pydantic model for the plugin’s input arguments.\n\n\n\nintegrations.base.BasePlugin.get_trainer_cls(cfg)\nReturns a custom class for the trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe global axolotl configuration.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntype[Trainer] | None\nThe first non-None trainer class returned by a plugin.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_training_args(cfg)\nReturns custom training arguments to set on TrainingArgs.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe global axolotl configuration.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nobject\n\ndict containing the training arguments.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_training_args_mixin()\nReturns a dataclass model for the plugin’s training arguments.\n\n\n\nintegrations.base.BasePlugin.load_datasets(cfg, preprocess=False)\nLoads and preprocesses the dataset for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\npreprocess\nbool\nWhether this is the preprocess step of the datasets.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndataset_meta\nUnion['TrainDatasetMeta', None]\nThe metadata for the training dataset.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.on_rollouts_scored(\n    cfg,\n    trainer,\n    prompts,\n    completions,\n    rewards,\n    advantages,\n)\nCalled after rollouts are scored during online RL (GRPO/PPO).\nProvides access to the full scored 
rollout data for logging, trace\nstorage, or analysis. Called once per scoring step with all samples\nfrom that step.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe axolotl configuration.\nrequired\n\n\ntrainer\n\nThe trainer instance.\nrequired\n\n\nprompts\nlist[str]\nList of prompt texts (one per sample).\nrequired\n\n\ncompletions\nlist[str]\nList of completion texts (one per sample).\nrequired\n\n\nrewards\ndict[str, list[float]]\nDict mapping reward function name to list of reward values.\nrequired\n\n\nadvantages\nlist[float]\nList of advantage values (one per sample).\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_lora_load(cfg, model)\nPerforms actions after LoRA weights are loaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_model_build(cfg, model)\nPerforms actions after the model is built/loaded, but before any adapters are applied.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_model_load(cfg, model)\nPerforms actions after the model is loaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_train(cfg, model)\nPerforms actions after training is complete.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe axolotl configuration.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_train_unload(cfg)\nPerforms actions after training is complete and the model is 
unloaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_trainer_create(cfg, trainer)\nPerforms actions after the trainer is created.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.pre_lora_load(cfg, model)\nPerforms actions before LoRA weights are loaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.pre_model_load(cfg)\nPerforms actions before the model is loaded.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.register(cfg)\nRegisters the plugin with the given configuration as an unparsed dict.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\n\n\nintegrations.base.PluginManager()\nThe PluginManager class is responsible for loading and managing plugins. 
It\nshould be a singleton so it can be accessed from anywhere in the codebase.\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nplugins\nOrderedDict[str, BasePlugin]\nA list of loaded plugins.\n\n\n\n\n\n\nKey methods include:\n- get_instance(): Static method to get the singleton instance of PluginManager.\n- register(plugin_name: str): Registers a new plugin by its name.\n- pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nCalls the add_callbacks_post_trainer method of all registered plugins.\n\n\nadd_callbacks_pre_trainer\nCalls the add_callbacks_pre_trainer method of all registered plugins.\n\n\ncreate_lr_scheduler\nCalls the create_lr_scheduler method of all registered plugins and returns\n\n\ncreate_optimizer\nCalls the create_optimizer method of all registered plugins and returns\n\n\nget_collator_cls_and_kwargs\nCalls the get_collator_cls_and_kwargs method of all registered plugins and returns the first non-None collator class.\n\n\nget_input_args\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\n\n\nget_instance\nReturns the singleton instance of PluginManager. 
If the instance doesn’t\n\n\nget_trainer_cls\nCalls the get_trainer_cls method of all registered plugins and returns the\n\n\nget_training_args\nCalls the get_training_args method of all registered plugins and returns the combined training arguments.\n\n\nget_training_args_mixin\nReturns a list of dataclasses for all registered plugins’ training args mixins’\n\n\nload_datasets\nCalls the load_datasets method of each registered plugin.\n\n\non_rollouts_scored\nCalls the on_rollouts_scored method of all registered plugins.\n\n\npost_lora_load\nCalls the post_lora_load method of all registered plugins.\n\n\npost_model_build\nCalls the post_model_build method of all registered plugins after the\n\n\npost_model_load\nCalls the post_model_load method of all registered plugins after the model\n\n\npost_train\nCalls the post_train method of all registered plugins.\n\n\npost_train_unload\nCalls the post_train_unload method of all registered plugins.\n\n\npost_trainer_create\nCalls the post_trainer_create method of all registered plugins.\n\n\npre_lora_load\nCalls the pre_lora_load method of all registered plugins.\n\n\npre_model_load\nCalls the pre_model_load method of all registered plugins.\n\n\nregister\nRegisters a new plugin by its name.\n\n\n\n\n\nintegrations.base.PluginManager.add_callbacks_post_trainer(cfg, trainer)\nCalls the add_callbacks_post_trainer method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[Callable]\nA list of callback functions to be added to the TrainingArgs.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.add_callbacks_pre_trainer(cfg, model)\nCalls the add_callbacks_pre_trainer method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe 
configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[Callable]\nA list of callback functions to be added to the TrainingArgs.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.create_lr_scheduler(\n    trainer,\n    optimizer,\n    num_training_steps,\n)\nCalls the create_lr_scheduler method of all registered plugins and returns\nthe first non-None scheduler.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\noptimizer\nOptimizer\nThe optimizer for training.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLRScheduler | None\nThe created learning rate scheduler, or None if not found.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.create_optimizer(trainer)\nCalls the create_optimizer method of all registered plugins and returns\nthe first non-None optimizer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nOptimizer | None\nThe created optimizer, or None if none was found.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.get_collator_cls_and_kwargs(cfg, is_eval=False)\nCalls the get_collator_cls_and_kwargs method of all registered plugins and returns the first non-None collator class.\nParameters:\ncfg (dict): The configuration for the plugins.\nis_eval (bool): Whether this is an eval split.\nReturns:\nobject: The collator class, or None if none was found.\n\n\n\nintegrations.base.PluginManager.get_input_args()\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[str]\nA list of Pydantic classes for all registered plugins’ input arguments.’\n\n\n\n\n\n\n\nintegrations.base.PluginManager.get_instance()\nReturns the singleton instance of 
PluginManager. If the instance doesn’t\nexist, it creates a new one.\n\n\n\nintegrations.base.PluginManager.get_trainer_cls(cfg)\nCalls the get_trainer_cls method of all registered plugins and returns the\nfirst non-None trainer class.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTrainer | None\nThe first non-None trainer class returned by a plugin.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.get_training_args(cfg)\nCalls the get_training_args method of all registered plugins and returns the combined training arguments.\nParameters:\ncfg (dict): The configuration for the plugins.\nReturns:\nobject: The training arguments\n\n\n\nintegrations.base.PluginManager.get_training_args_mixin()\nReturns a list of dataclasses for all registered plugins’ training args mixins’\nReturns:\nlist[str]: A list of dataclsses\n\n\n\nintegrations.base.PluginManager.load_datasets(cfg, preprocess=False)\nCalls the load_datasets method of each registered plugin.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\npreprocess\nbool\nWhether this is preprocess step of the datasets.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nUnion['TrainDatasetMeta', None]\nThe dataset metadata loaded from all registered plugins.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.on_rollouts_scored(\n    cfg,\n    trainer,\n    prompts,\n    completions,\n    rewards,\n    advantages,\n)\nCalls the on_rollouts_scored method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\ntrainer\n\nThe trainer instance.\nrequired\n\n\nprompts\nlist[str]\nList of prompt texts.\nrequired\n\n\ncompletions\nlist[str]\nList of completion 
texts.\nrequired\n\n\nrewards\ndict[str, list[float]]\nDict mapping reward function name to list of rewards.\nrequired\n\n\nadvantages\nlist[float]\nList of advantage values.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_lora_load(cfg, model)\nCalls the post_lora_load method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_model_build(cfg, model)\nCalls the post_model_build method of all registered plugins after the\nmodel has been built / loaded, but before any adapters have been applied.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_model_load(cfg, model)\nCalls the post_model_load method of all registered plugins after the model\nhas been loaded inclusive of any adapters.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_train(cfg, model)\nCalls the post_train method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel | PeftModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_train_unload(cfg)\nCalls the post_train_unload method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_trainer_create(cfg, trainer)\nCalls the 
post_trainer_create method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.pre_lora_load(cfg, model)\nCalls the pre_lora_load method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\nmodel\nPreTrainedModel\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.pre_model_load(cfg)\nCalls the pre_model_load method of all registered plugins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugins.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.register(plugin_name)\nRegisters a new plugin by its name.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nplugin_name\nstr\nThe name of the plugin to be registered.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nImportError\nIf the plugin module cannot be imported."
+    "text": "Name\nDescription\n\n\n\n\ncheck_remote_config\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\n\n\nchoose_config\nHelper method for choosing a axolotl config YAML file (considering only files\n\n\nload_cfg\nLoads the axolotl configuration stored at config, validates it, and performs\n\n\nprepare_plugins\nRegisters the plugins for the given configuration.\n\n\n\n\n\ncli.config.check_remote_config(config)\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\nfor it and parse its content, first as JSON, then as YAML (YAML is preferred).\nFinally, the parsed content is written to a local file and its path is returned.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[str, Path]\nHTTPS URL to a YAML or JSON file.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nUnion[str, Path]\nEither the original config if it’s not a valid HTTPS URL, or the path to the\n\n\n\nUnion[str, Path]\ndownloaded remote config.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the remote configuration is neither valid JSON or YAML.\n\n\n\nRuntimeError\nIf some request-related exception occurs from the file download.\n\n\n\nException\nCatch-all for any other exception.\n\n\n\n\n\n\n\ncli.config.choose_config(path)\nHelper method for choosing a axolotl config YAML file (considering only files\nending with .yml or .yaml). 
If more than one config file exists in the passed\npath, the user is prompted to choose one.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nPath\nDirectory in which config file(s) are stored.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to either (1) the sole YAML file, or (2) if more than one YAML files exist,\n\n\n\nstr\nthe user-selected YAML file.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf no YAML files are found in the given path.\n\n\n\n\n\n\n\ncli.config.load_cfg(config=Path('examples/'), **kwargs)\nLoads the axolotl configuration stored at config, validates it, and performs\nvarious setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr | Path | DictDefault\nPath (local or remote) to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nDictDefault\nDictDefault mapping configuration keys to values.\n\n\n\n\n\n\n\ncli.config.prepare_plugins(cfg)\nRegisters the plugins for the given configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired"
   },
   {
-    "objectID": "docs/api/integrations.base.html#functions",
-    "href": "docs/api/integrations.base.html#functions",
-    "title": "integrations.base",
+    "objectID": "docs/api/utils.collators.mm_chat.html",
+    "href": "docs/api/utils.collators.mm_chat.html",
+    "title": "utils.collators.mm_chat",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nload_plugin\nLoads a plugin based on the given plugin name.\n\n\n\n\n\nintegrations.base.load_plugin(plugin_name)\nLoads a plugin based on the given plugin name.\nThe plugin name should be in the format “module_name.class_name”. This function\nsplits the plugin name into module and class, imports the module, retrieves the\nclass from the module, and creates an instance of the class.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nplugin_name\nstr\nThe name of the plugin to be loaded. The name should be in the format “module_name.class_name”.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nBasePlugin\nAn instance of the loaded plugin.\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nImportError\nIf the plugin module cannot be imported."
+    "text": "utils.collators.mm_chat\nCollators for multi-modal chat messages and packing\n\n\n\n\n\nName\nDescription\n\n\n\n\nMultiModalChatDataCollator\nCollator for multi-modal chat messages\n\n\n\n\n\nutils.collators.mm_chat.MultiModalChatDataCollator(\n    tokenizer,\n    processing_strategy,\n    packing=False,\n    return_tensors='pt',\n    padding=True,\n    pad_to_multiple_of=None,\n)\nCollator for multi-modal chat messages"
   },
   {
-    "objectID": "docs/api/utils.callbacks.perplexity.html",
-    "href": "docs/api/utils.callbacks.perplexity.html",
-    "title": "utils.callbacks.perplexity",
+    "objectID": "docs/api/utils.collators.mm_chat.html#classes",
+    "href": "docs/api/utils.collators.mm_chat.html#classes",
+    "title": "utils.collators.mm_chat",
     "section": "",
-    "text": "utils.callbacks.perplexity\ncallback to calculate perplexity as an evaluation metric.\n\n\n\n\n\nName\nDescription\n\n\n\n\nPerplexity\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity(tokenizer, max_seq_len, stride=512)\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\nThis is a custom variant that doesn’t re-tokenize the input or re-load the model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute\nCompute perplexity in a fixed length sliding window across the sequence.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity.compute(model, references=None)\nCompute perplexity in a fixed length sliding window across the sequence."
+    "text": "Name\nDescription\n\n\n\n\nMultiModalChatDataCollator\nCollator for multi-modal chat messages\n\n\n\n\n\nutils.collators.mm_chat.MultiModalChatDataCollator(\n    tokenizer,\n    processing_strategy,\n    packing=False,\n    return_tensors='pt',\n    padding=True,\n    pad_to_multiple_of=None,\n)\nCollator for multi-modal chat messages"
   },
   {
-    "objectID": "docs/api/utils.callbacks.perplexity.html#classes",
-    "href": "docs/api/utils.callbacks.perplexity.html#classes",
-    "title": "utils.callbacks.perplexity",
+    "objectID": "docs/api/cli.cloud.modal_.html",
+    "href": "docs/api/cli.cloud.modal_.html",
+    "title": "cli.cloud.modal_",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nPerplexity\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity(tokenizer, max_seq_len, stride=512)\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\nThis is a custom variant that doesn’t re-tokenize the input or re-load the model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute\nCompute perplexity in a fixed length sliding window across the sequence.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity.compute(model, references=None)\nCompute perplexity in a fixed length sliding window across the sequence."
+    "text": "cli.cloud.modal_\nModal Cloud support from CLI\n\n\n\n\n\nName\nDescription\n\n\n\n\nModalCloud\nModal Cloud implementation.\n\n\n\n\n\ncli.cloud.modal_.ModalCloud(config, app=None)\nModal Cloud implementation.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nrun_cmd\nRun a command inside a folder, with Modal Volume reloading before and commit on success.\n\n\n\n\n\ncli.cloud.modal_.run_cmd(cmd, run_folder, volumes=None)\nRun a command inside a folder, with Modal Volume reloading before and commit on success."
   },
   {
-    "objectID": "docs/api/loaders.tokenizer.html",
-    "href": "docs/api/loaders.tokenizer.html",
-    "title": "loaders.tokenizer",
+    "objectID": "docs/api/cli.cloud.modal_.html#classes",
+    "href": "docs/api/cli.cloud.modal_.html#classes",
+    "title": "cli.cloud.modal_",
     "section": "",
-    "text": "loaders.tokenizer\nTokenizer loading functionality and associated utils\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_tokenizer\nLoad and configure the tokenizer based on the provided config.\n\n\nmodify_tokenizer_files\nModify tokenizer files to replace added_tokens strings, save to output directory,\n\n\n\n\n\nloaders.tokenizer.load_tokenizer(cfg)\nLoad and configure the tokenizer based on the provided config.\n\n\n\nloaders.tokenizer.modify_tokenizer_files(\n    tokenizer_path,\n    token_mappings,\n    output_dir,\n    revision='main',\n)\nModify tokenizer files to replace added_tokens strings, save to output directory,\nand return the path to the modified tokenizer.\nThis only works with reserved tokens that were added to the tokenizer, not tokens\nalready part of the vocab.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntokenizer_path\nstr\nPath or name of the original tokenizer\nrequired\n\n\ntoken_mappings\ndict[int, str]\nDict mapping {token_id (int): new_token_string}\nrequired\n\n\noutput_dir\nstr\nDirectory to save the modified tokenizer\nrequired\n\n\nrevision\nstr\nModel revision/branch/tag/commit to load from (HF Hub)\n'main'\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to the modified tokenizer directory\n\n\n\nRef: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941"
+    "text": "Name\nDescription\n\n\n\n\nModalCloud\nModal Cloud implementation.\n\n\n\n\n\ncli.cloud.modal_.ModalCloud(config, app=None)\nModal Cloud implementation."
   },
   {
-    "objectID": "docs/api/loaders.tokenizer.html#functions",
-    "href": "docs/api/loaders.tokenizer.html#functions",
-    "title": "loaders.tokenizer",
+    "objectID": "docs/api/cli.cloud.modal_.html#functions",
+    "href": "docs/api/cli.cloud.modal_.html#functions",
+    "title": "cli.cloud.modal_",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nload_tokenizer\nLoad and configure the tokenizer based on the provided config.\n\n\nmodify_tokenizer_files\nModify tokenizer files to replace added_tokens strings, save to output directory,\n\n\n\n\n\nloaders.tokenizer.load_tokenizer(cfg)\nLoad and configure the tokenizer based on the provided config.\n\n\n\nloaders.tokenizer.modify_tokenizer_files(\n    tokenizer_path,\n    token_mappings,\n    output_dir,\n    revision='main',\n)\nModify tokenizer files to replace added_tokens strings, save to output directory,\nand return the path to the modified tokenizer.\nThis only works with reserved tokens that were added to the tokenizer, not tokens\nalready part of the vocab.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntokenizer_path\nstr\nPath or name of the original tokenizer\nrequired\n\n\ntoken_mappings\ndict[int, str]\nDict mapping {token_id (int): new_token_string}\nrequired\n\n\noutput_dir\nstr\nDirectory to save the modified tokenizer\nrequired\n\n\nrevision\nstr\nModel revision/branch/tag/commit to load from (HF Hub)\n'main'\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to the modified tokenizer directory\n\n\n\nRef: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941"
+    "text": "Name\nDescription\n\n\n\n\nrun_cmd\nRun a command inside a folder, with Modal Volume reloading before and commit on success.\n\n\n\n\n\ncli.cloud.modal_.run_cmd(cmd, run_folder, volumes=None)\nRun a command inside a folder, with Modal Volume reloading before and commit on success."
   },
   {
-    "objectID": "docs/api/core.datasets.transforms.chat_builder.html",
-    "href": "docs/api/core.datasets.transforms.chat_builder.html",
-    "title": "core.datasets.transforms.chat_builder",
+    "objectID": "docs/api/utils.data.streaming.html",
+    "href": "docs/api/utils.data.streaming.html",
+    "title": "utils.data.streaming",
     "section": "",
-    "text": "core.datasets.transforms.chat_builder\nThis module contains a function that builds a transform that takes a row from the\ndataset and converts it to a Chat.\n\n\n\n\n\nName\nDescription\n\n\n\n\nchat_message_transform_builder\nBuilds a transform that takes a row from the dataset and converts it to a Chat\n\n\n\n\n\ncore.datasets.transforms.chat_builder.chat_message_transform_builder(\n    train_on_inputs=False,\n    conversations_field='messages',\n    message_field_role=None,\n    message_field_content=None,\n    message_field_training=None,\n)\nBuilds a transform that takes a row from the dataset and converts it to a Chat\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntrain_on_inputs\nbool\nIf True, the transform will train on the inputs. If False, the transform will train on the targets. Defaults to False.\nFalse\n\n\nconversations_field\nstr\nThe field name of the conversations. Defaults to “messages”.\n'messages'\n\n\nmessage_field_role\nstr | list[str]\nThe field name of the role.\nNone\n\n\nmessage_field_content\nstr | list[str]\nThe field name of the message content.\nNone\n\n\nmessage_field_training\nstr | list[str]\nThe field name of the train/weight.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nCallable\n\nA function that takes a list of conversations and returns a list of messages."
+    "text": "utils.data.streaming\nutils.data.streaming\nData handling specific to streaming datasets."
   },
   {
-    "objectID": "docs/api/core.datasets.transforms.chat_builder.html#functions",
-    "href": "docs/api/core.datasets.transforms.chat_builder.html#functions",
-    "title": "core.datasets.transforms.chat_builder",
+    "objectID": "docs/api/prompt_strategies.input_output.html",
+    "href": "docs/api/prompt_strategies.input_output.html",
+    "title": "prompt_strategies.input_output",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nchat_message_transform_builder\nBuilds a transform that takes a row from the dataset and converts it to a Chat\n\n\n\n\n\ncore.datasets.transforms.chat_builder.chat_message_transform_builder(\n    train_on_inputs=False,\n    conversations_field='messages',\n    message_field_role=None,\n    message_field_content=None,\n    message_field_training=None,\n)\nBuilds a transform that takes a row from the dataset and converts it to a Chat\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntrain_on_inputs\nbool\nIf True, the transform will train on the inputs. If False, the transform will train on the targets. Defaults to False.\nFalse\n\n\nconversations_field\nstr\nThe field name of the conversations. Defaults to “messages”.\n'messages'\n\n\nmessage_field_role\nstr | list[str]\nThe field name of the role.\nNone\n\n\nmessage_field_content\nstr | list[str]\nThe field name of the message content.\nNone\n\n\nmessage_field_training\nstr | list[str]\nThe field name of the train/weight.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nCallable\n\nA function that takes a list of conversations and returns a list of messages."
+    "text": "prompt_strategies.input_output\nModule for plain input/output prompt pairs\n\n\n\n\n\nName\nDescription\n\n\n\n\nRawInputOutputPrompter\nprompter for raw i/o data\n\n\nRawInputOutputStrategy\nPrompt Strategy class for input/output pairs\n\n\n\n\n\nprompt_strategies.input_output.RawInputOutputPrompter()\nprompter for raw i/o data\n\n\n\nprompt_strategies.input_output.RawInputOutputStrategy(\n    *args,\n    eos_token=None,\n    **kwargs,\n)\nPrompt Strategy class for input/output pairs"
   },
   {
-    "objectID": "docs/api/prompt_strategies.dpo.chat_template.html",
-    "href": "docs/api/prompt_strategies.dpo.chat_template.html",
-    "title": "prompt_strategies.dpo.chat_template",
+    "objectID": "docs/api/prompt_strategies.input_output.html#classes",
+    "href": "docs/api/prompt_strategies.input_output.html#classes",
+    "title": "prompt_strategies.input_output",
     "section": "",
-    "text": "prompt_strategies.dpo.chat_template\nDPO prompt strategies for using tokenizer chat templates.\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nDPO chat template strategy for argilla-style datasets.\n\n\n\n\n\nprompt_strategies.dpo.chat_template.argilla_chat(cfg, dataset_idx=0, **kwargs)\nDPO chat template strategy for argilla-style datasets.\nFor argilla-style datasets where chosen/rejected contain full conversations\ninstead of single response messages. Extracts the conversation history from\nthe chosen field and formats both chosen/rejected responses using the\nconfigured chat template.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\n\nConfiguration object containing chat_template and dataset settings\nrequired\n\n\ndataset_idx\n\nIndex of the dataset in the config (default: 0)\n0\n\n\n**kwargs\n\nAdditional keyword arguments (unused)\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ntuple\n\n(transform_fn, dataset_kwargs) where: - transform_fn: Function to transform dataset samples - dataset_kwargs: Dict with ‘remove_columns’ specifying columns to drop\n\n\n\n\n\n\n{\n“chosen”: [\n{“role”: “user”, “content”: “…”},\n{“role”: “assistant”, “content”: “…”}\n],\n“rejected”: [\n{“role”: “user”, “content”: “…”},\n{“role”: “assistant”, “content”: “…”}\n]\n}"
+    "text": "Name\nDescription\n\n\n\n\nRawInputOutputPrompter\nprompter for raw i/o data\n\n\nRawInputOutputStrategy\nPrompt Strategy class for input/output pairs\n\n\n\n\n\nprompt_strategies.input_output.RawInputOutputPrompter()\nprompter for raw i/o data\n\n\n\nprompt_strategies.input_output.RawInputOutputStrategy(\n    *args,\n    eos_token=None,\n    **kwargs,\n)\nPrompt Strategy class for input/output pairs"
   },
   {
-    "objectID": "docs/api/prompt_strategies.dpo.chat_template.html#functions",
-    "href": "docs/api/prompt_strategies.dpo.chat_template.html#functions",
-    "title": "prompt_strategies.dpo.chat_template",
+    "objectID": "docs/api/cli.inference.html",
+    "href": "docs/api/cli.inference.html",
+    "title": "cli.inference",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nargilla_chat\nDPO chat template strategy for argilla-style datasets.\n\n\n\n\n\nprompt_strategies.dpo.chat_template.argilla_chat(cfg, dataset_idx=0, **kwargs)\nDPO chat template strategy for argilla-style datasets.\nFor argilla-style datasets where chosen/rejected contain full conversations\ninstead of single response messages. Extracts the conversation history from\nthe chosen field and formats both chosen/rejected responses using the\nconfigured chat template.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\n\nConfiguration object containing chat_template and dataset settings\nrequired\n\n\ndataset_idx\n\nIndex of the dataset in the config (default: 0)\n0\n\n\n**kwargs\n\nAdditional keyword arguments (unused)\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ntuple\n\n(transform_fn, dataset_kwargs) where: - transform_fn: Function to transform dataset samples - dataset_kwargs: Dict with ‘remove_columns’ specifying columns to drop\n\n\n\n\n\n\n{\n“chosen”: [\n{“role”: “user”, “content”: “…”},\n{“role”: “assistant”, “content”: “…”}\n],\n“rejected”: [\n{“role”: “user”, “content”: “…”},\n{“role”: “assistant”, “content”: “…”}\n]\n}"
+    "text": "cli.inference\nCLI to run inference on a trained model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_inference or do_inference_gradio.\n\n\ndo_inference\nRuns inference on the command line in a loop. User input is accepted, a chat\n\n\ndo_inference_gradio\nRuns inference in a Gradio interface. User input is accepted, a chat template is\n\n\nget_multi_line_input\nGets multi-line input from terminal.\n\n\n\n\n\ncli.inference.do_cli(config=Path('examples/'), gradio=False, **kwargs)\nParses axolotl config, CLI args, and calls do_inference or do_inference_gradio.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.inference.do_inference(cfg, cli_args)\nRuns inference on the command line in a loop. User input is accepted, a chat\ntemplate is (optionally) applied, and the model specified in the axolotl config is\nused to generate completions according to a default generation config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nInferenceCliArgs\nInference-specific CLI arguments.\nrequired\n\n\n\n\n\n\n\ncli.inference.do_inference_gradio(cfg, cli_args)\nRuns inference in a Gradio interface. 
User input is accepted, a chat template is\n(optionally) applied, and the model specified in the axolotl config is used to\ngenerate completions according to a default generation config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nInferenceCliArgs\nInference-specific CLI arguments.\nrequired\n\n\n\n\n\n\n\ncli.inference.get_multi_line_input()\nGets multi-line input from terminal.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPossibly multi-line, possibly empty stdin input as a string."
   },
   {
-    "objectID": "docs/api/loaders.processor.html",
-    "href": "docs/api/loaders.processor.html",
-    "title": "loaders.processor",
+    "objectID": "docs/api/cli.inference.html#functions",
+    "href": "docs/api/cli.inference.html#functions",
+    "title": "cli.inference",
     "section": "",
-    "text": "loaders.processor\nloaders.processor\nProcessor loading functionality for multi-modal models"
+    "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_inference or do_inference_gradio.\n\n\ndo_inference\nRuns inference on the command line in a loop. User input is accepted, a chat\n\n\ndo_inference_gradio\nRuns inference in a Gradio interface. User input is accepted, a chat template is\n\n\nget_multi_line_input\nGets multi-line input from terminal.\n\n\n\n\n\ncli.inference.do_cli(config=Path('examples/'), gradio=False, **kwargs)\nParses axolotl config, CLI args, and calls do_inference or do_inference_gradio.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.inference.do_inference(cfg, cli_args)\nRuns inference on the command line in a loop. User input is accepted, a chat\ntemplate is (optionally) applied, and the model specified in the axolotl config is\nused to generate completions according to a default generation config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nInferenceCliArgs\nInference-specific CLI arguments.\nrequired\n\n\n\n\n\n\n\ncli.inference.do_inference_gradio(cfg, cli_args)\nRuns inference in a Gradio interface. 
User input is accepted, a chat template is\n(optionally) applied, and the model specified in the axolotl config is used to\ngenerate completions according to a default generation config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nInferenceCliArgs\nInference-specific CLI arguments.\nrequired\n\n\n\n\n\n\n\ncli.inference.get_multi_line_input()\nGets multi-line input from terminal.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPossibly multi-line, possibly empty stdin input as a string."
   },
   {
-    "objectID": "docs/api/utils.tokenization.html",
-    "href": "docs/api/utils.tokenization.html",
-    "title": "utils.tokenization",
+    "objectID": "docs/api/prompt_strategies.orcamini.html",
+    "href": "docs/api/prompt_strategies.orcamini.html",
+    "title": "prompt_strategies.orcamini",
     "section": "",
-    "text": "utils.tokenization\nModule for tokenization utilities\n\n\n\n\n\nName\nDescription\n\n\n\n\ncolor_token_for_rl_debug\nHelper function to color tokens based on their type.\n\n\nprocess_tokens_for_rl_debug\nHelper function to process and color tokens.\n\n\n\n\n\nutils.tokenization.color_token_for_rl_debug(\n    decoded_token,\n    encoded_token,\n    color,\n    text_only,\n)\nHelper function to color tokens based on their type.\n\n\n\nutils.tokenization.process_tokens_for_rl_debug(\n    tokens,\n    color,\n    tokenizer,\n    text_only,\n)\nHelper function to process and color tokens."
+    "text": "prompt_strategies.orcamini\nPrompt Strategy for finetuning Orca Mini (v2) models\nsee also https://huggingface.co/psmathur/orca_mini_v2_7b for more information\nUse dataset type: orcamini in config.yml to use this prompt style.\nCompared to the alpaca_w_system.open_orca dataset type,\nthis one specifies the system prompt with “### System:”.\nNot suited/tested for multiple-turn conversations without further adjustments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nOrcaMiniPrompter\nAdjusted Prompter for Orca Mini (v2) datasets\n\n\n\n\n\nprompt_strategies.orcamini.OrcaMiniPrompter(\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAdjusted Prompter for Orca Mini (v2) datasets"
   },
   {
-    "objectID": "docs/api/utils.tokenization.html#functions",
-    "href": "docs/api/utils.tokenization.html#functions",
-    "title": "utils.tokenization",
+    "objectID": "docs/api/prompt_strategies.orcamini.html#classes",
+    "href": "docs/api/prompt_strategies.orcamini.html#classes",
+    "title": "prompt_strategies.orcamini",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ncolor_token_for_rl_debug\nHelper function to color tokens based on their type.\n\n\nprocess_tokens_for_rl_debug\nHelper function to process and color tokens.\n\n\n\n\n\nutils.tokenization.color_token_for_rl_debug(\n    decoded_token,\n    encoded_token,\n    color,\n    text_only,\n)\nHelper function to color tokens based on their type.\n\n\n\nutils.tokenization.process_tokens_for_rl_debug(\n    tokens,\n    color,\n    tokenizer,\n    text_only,\n)\nHelper function to process and color tokens."
+    "text": "Name\nDescription\n\n\n\n\nOrcaMiniPrompter\nAdjusted Prompter for Orca Mini (v2) datasets\n\n\n\n\n\nprompt_strategies.orcamini.OrcaMiniPrompter(\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAdjusted Prompter for Orca Mini (v2) datasets"
   },
   {
-    "objectID": "docs/api/monkeypatch.trainer_fsdp_optim.html",
-    "href": "docs/api/monkeypatch.trainer_fsdp_optim.html",
-    "title": "monkeypatch.trainer_fsdp_optim",
+    "objectID": "docs/api/train.html",
+    "href": "docs/api/train.html",
+    "title": "train",
     "section": "",
-    "text": "monkeypatch.trainer_fsdp_optim\nfix for FSDP optimizer save in trainer w 4.47.0\n\n\n\n\n\nName\nDescription\n\n\n\n\npatch_training_loop_for_fsdp\nmonkeypatch for fixing the training loop for fsdp with optimizer save\n\n\n\n\n\nmonkeypatch.trainer_fsdp_optim.patch_training_loop_for_fsdp()\nmonkeypatch for fixing the training loop for fsdp with optimizer save"
+    "text": "train\nPrepare and train a model on a dataset. Can also infer from a model or merge lora\n\n\n\n\n\nName\nDescription\n\n\n\n\ncreate_model_card\nCreate a model card for the trained model if needed.\n\n\nexecute_training\nExecute the training process with appropriate SDP kernel configurations.\n\n\nhandle_untrained_tokens_fix\nApply fixes for untrained tokens if configured.\n\n\nsave_initial_configs\nSave initial configurations before training.\n\n\nsave_trained_model\nSave the trained model according to configuration and training setup.\n\n\nsetup_model_and_tokenizer\nLoad the tokenizer, processor (for multimodal models), and model based on\n\n\nsetup_model_and_trainer\nLoad model, tokenizer, trainer, etc. Helper function to encapsulate the full\n\n\nsetup_model_card\nSet up the Axolotl badge and add the Axolotl config to the model card if available.\n\n\nsetup_reference_model\nSet up the reference model for RL training if needed.\n\n\nsetup_signal_handler\nSet up signal handler for graceful termination.\n\n\ntrain\nTrain a model on the given dataset.\n\n\n\n\n\ntrain.create_model_card(cfg, trainer)\nCreate a model card for the trained model if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object with model card creation capabilities.\nrequired\n\n\n\n\n\n\n\ntrain.execute_training(cfg, trainer, resume_from_checkpoint)\nExecute the training process with appropriate SDP kernel configurations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nAny\nThe configured trainer object.\nrequired\n\n\nresume_from_checkpoint\nstr | None\nPath to checkpoint to resume from, if applicable.\nrequired\n\n\n\n\n\n\n\ntrain.handle_untrained_tokens_fix(cfg, model, tokenizer, train_dataset)\nApply fixes for untrained tokens if 
configured.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to apply fixes to.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer for token identification.\nrequired\n\n\ntrain_dataset\nDataset\nThe training dataset to use.\nrequired\n\n\n\n\n\n\n\ntrain.save_initial_configs(cfg, tokenizer, model, peft_config, processor)\nSave initial configurations before training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer to save.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to save configuration for.\nrequired\n\n\npeft_config\nPeftConfig | None\nThe PEFT configuration to save if applicable.\nrequired\n\n\n\n\n\n\n\ntrain.save_trained_model(cfg, trainer, model)\nSave the trained model according to configuration and training setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nAny\nThe trainer object.\nrequired\n\n\nmodel\nPreTrainedModel\nThe trained model to save.\nrequired\n\n\n\n\n\n\n\ntrain.setup_model_and_tokenizer(cfg)\nLoad the tokenizer, processor (for multimodal models), and model based on\nconfiguration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None]\nTuple containing model, tokenizer, peft_config (if LoRA / QLoRA, else None), and processor (if multimodal, else None).\n\n\n\n\n\n\n\ntrain.setup_model_and_trainer(cfg, dataset_meta)\nLoad model, tokenizer, trainer, etc. 
Helper function to encapsulate the full\ntrainer setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration dictionary with training parameters.\nrequired\n\n\ndataset_meta\nTrainDatasetMeta\nObject with training, validation datasets and metadata.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple['HFRLTrainerBuilder' | 'HFCausalTrainerBuilder', PeftModel | PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None]\nTuple of: - Trainer (Causal or RLHF) - Model - Tokenizer - PEFT config - Processor\n\n\n\n\n\n\n\ntrain.setup_model_card(cfg)\nSet up the Axolotl badge and add the Axolotl config to the model card if available.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\ntrain.setup_reference_model(cfg, tokenizer)\nSet up the reference model for RL training if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer to use for the reference model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nPreTrainedModel | None\nReference model if needed for RL training, None otherwise.\n\n\n\n\n\n\n\ntrain.setup_signal_handler(cfg, model)\nSet up signal handler for graceful termination.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to save on termination\nrequired\n\n\n\n\n\n\n\ntrain.train(cfg, dataset_meta)\nTrain a model on the given dataset.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration dictionary with training parameters\nrequired\n\n\ndataset_meta\nTrainDatasetMeta\nObject with training, validation datasets 
and metadata\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PeftModel | PreTrainedModel, PreTrainedTokenizer, Trainer]\nTuple of (model, tokenizer, trainer) after training"
   },
   {
-    "objectID": "docs/api/monkeypatch.trainer_fsdp_optim.html#functions",
-    "href": "docs/api/monkeypatch.trainer_fsdp_optim.html#functions",
-    "title": "monkeypatch.trainer_fsdp_optim",
+    "objectID": "docs/api/train.html#functions",
+    "href": "docs/api/train.html#functions",
+    "title": "train",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\npatch_training_loop_for_fsdp\nmonkeypatch for fixing the training loop for fsdp with optimizer save\n\n\n\n\n\nmonkeypatch.trainer_fsdp_optim.patch_training_loop_for_fsdp()\nmonkeypatch for fixing the training loop for fsdp with optimizer save"
+    "text": "Name\nDescription\n\n\n\n\ncreate_model_card\nCreate a model card for the trained model if needed.\n\n\nexecute_training\nExecute the training process with appropriate SDP kernel configurations.\n\n\nhandle_untrained_tokens_fix\nApply fixes for untrained tokens if configured.\n\n\nsave_initial_configs\nSave initial configurations before training.\n\n\nsave_trained_model\nSave the trained model according to configuration and training setup.\n\n\nsetup_model_and_tokenizer\nLoad the tokenizer, processor (for multimodal models), and model based on\n\n\nsetup_model_and_trainer\nLoad model, tokenizer, trainer, etc. Helper function to encapsulate the full\n\n\nsetup_model_card\nSet up the Axolotl badge and add the Axolotl config to the model card if available.\n\n\nsetup_reference_model\nSet up the reference model for RL training if needed.\n\n\nsetup_signal_handler\nSet up signal handler for graceful termination.\n\n\ntrain\nTrain a model on the given dataset.\n\n\n\n\n\ntrain.create_model_card(cfg, trainer)\nCreate a model card for the trained model if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nTrainer\nThe trainer object with model card creation capabilities.\nrequired\n\n\n\n\n\n\n\ntrain.execute_training(cfg, trainer, resume_from_checkpoint)\nExecute the training process with appropriate SDP kernel configurations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nAny\nThe configured trainer object.\nrequired\n\n\nresume_from_checkpoint\nstr | None\nPath to checkpoint to resume from, if applicable.\nrequired\n\n\n\n\n\n\n\ntrain.handle_untrained_tokens_fix(cfg, model, tokenizer, train_dataset)\nApply fixes for untrained tokens if 
configured.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to apply fixes to.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer for token identification.\nrequired\n\n\ntrain_dataset\nDataset\nThe training dataset to use.\nrequired\n\n\n\n\n\n\n\ntrain.save_initial_configs(cfg, tokenizer, model, peft_config, processor)\nSave initial configurations before training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer to save.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to save configuration for.\nrequired\n\n\npeft_config\nPeftConfig | None\nThe PEFT configuration to save if applicable.\nrequired\n\n\n\n\n\n\n\ntrain.save_trained_model(cfg, trainer, model)\nSave the trained model according to configuration and training setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntrainer\nAny\nThe trainer object.\nrequired\n\n\nmodel\nPreTrainedModel\nThe trained model to save.\nrequired\n\n\n\n\n\n\n\ntrain.setup_model_and_tokenizer(cfg)\nLoad the tokenizer, processor (for multimodal models), and model based on\nconfiguration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None]\nTuple containing model, tokenizer, peft_config (if LoRA / QLoRA, else None), and processor (if multimodal, else None).\n\n\n\n\n\n\n\ntrain.setup_model_and_trainer(cfg, dataset_meta)\nLoad model, tokenizer, trainer, etc. 
Helper function to encapsulate the full\ntrainer setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration dictionary with training parameters.\nrequired\n\n\ndataset_meta\nTrainDatasetMeta\nObject with training, validation datasets and metadata.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple['HFRLTrainerBuilder' | 'HFCausalTrainerBuilder', PeftModel | PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None]\nTuple of: - Trainer (Causal or RLHF) - Model - Tokenizer - PEFT config - Processor\n\n\n\n\n\n\n\ntrain.setup_model_card(cfg)\nSet up the Axolotl badge and add the Axolotl config to the model card if available.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\ntrain.setup_reference_model(cfg, tokenizer)\nSet up the reference model for RL training if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nThe tokenizer to use for the reference model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nPreTrainedModel | None\nReference model if needed for RL training, None otherwise.\n\n\n\n\n\n\n\ntrain.setup_signal_handler(cfg, model)\nSet up signal handler for graceful termination.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\nmodel\nPreTrainedModel\nThe model to save on termination\nrequired\n\n\n\n\n\n\n\ntrain.train(cfg, dataset_meta)\nTrain a model on the given dataset.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration dictionary with training parameters\nrequired\n\n\ndataset_meta\nTrainDatasetMeta\nObject with training, validation datasets 
and metadata\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PeftModel | PreTrainedModel, PreTrainedTokenizer, Trainer]\nTuple of (model, tokenizer, trainer) after training"
   },
   {
-    "objectID": "docs/api/utils.data.sft.html",
-    "href": "docs/api/utils.data.sft.html",
-    "title": "utils.data.sft",
+    "objectID": "docs/api/core.trainers.mamba.html",
+    "href": "docs/api/core.trainers.mamba.html",
+    "title": "core.trainers.mamba",
     "section": "",
-    "text": "utils.data.sft\nData handling specific to SFT.\n\n\n\n\n\nName\nDescription\n\n\n\n\nprepare_datasets\nPrepare training and evaluation datasets based on configuration.\n\n\n\n\n\nutils.data.sft.prepare_datasets(cfg, tokenizer, processor=None)\nPrepare training and evaluation datasets based on configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nTokenizer to use for processing text.\nrequired\n\n\nprocessor\nProcessorMixin | None\nOptional processor for multimodal datasets.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[IterableDataset | Dataset, Dataset | None, int, list[Prompter | None]]\nTuple of (train_dataset, eval_dataset, total_steps, prompters)."
+    "text": "core.trainers.mamba\nModule for mamba trainer\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlMambaTrainer\nMamba specific trainer to handle loss calculation\n\n\n\n\n\ncore.trainers.mamba.AxolotlMambaTrainer(\n    *_args,\n    bench_data_collator=None,\n    eval_data_collator=None,\n    dataset_tags=None,\n    **kwargs,\n)\nMamba specific trainer to handle loss calculation"
   },
   {
-    "objectID": "docs/api/utils.data.sft.html#functions",
-    "href": "docs/api/utils.data.sft.html#functions",
-    "title": "utils.data.sft",
+    "objectID": "docs/api/core.trainers.mamba.html#classes",
+    "href": "docs/api/core.trainers.mamba.html#classes",
+    "title": "core.trainers.mamba",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nprepare_datasets\nPrepare training and evaluation datasets based on configuration.\n\n\n\n\n\nutils.data.sft.prepare_datasets(cfg, tokenizer, processor=None)\nPrepare training and evaluation datasets based on configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ntokenizer\nPreTrainedTokenizer\nTokenizer to use for processing text.\nrequired\n\n\nprocessor\nProcessorMixin | None\nOptional processor for multimodal datasets.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[IterableDataset | Dataset, Dataset | None, int, list[Prompter | None]]\nTuple of (train_dataset, eval_dataset, total_steps, prompters)."
+    "text": "Name\nDescription\n\n\n\n\nAxolotlMambaTrainer\nMamba specific trainer to handle loss calculation\n\n\n\n\n\ncore.trainers.mamba.AxolotlMambaTrainer(\n    *_args,\n    bench_data_collator=None,\n    eval_data_collator=None,\n    dataset_tags=None,\n    **kwargs,\n)\nMamba specific trainer to handle loss calculation"
   },
   {
-    "objectID": "docs/api/prompt_strategies.dpo.user_defined.html",
-    "href": "docs/api/prompt_strategies.dpo.user_defined.html",
-    "title": "prompt_strategies.dpo.user_defined",
+    "objectID": "docs/api/integrations.lm_eval.args.html",
+    "href": "docs/api/integrations.lm_eval.args.html",
+    "title": "integrations.lm_eval.args",
     "section": "",
-    "text": "prompt_strategies.dpo.user_defined\nprompt_strategies.dpo.user_defined\nUser-defined DPO strategies"
+    "text": "integrations.lm_eval.args\nModule for handling lm eval harness input arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nLMEvalArgs\nInput args for lm eval harness\n\n\n\n\n\nintegrations.lm_eval.args.LMEvalArgs()\nInput args for lm eval harness"
   },
   {
-    "objectID": "docs/api/integrations.spectrum.args.html",
-    "href": "docs/api/integrations.spectrum.args.html",
-    "title": "integrations.spectrum.args",
+    "objectID": "docs/api/integrations.lm_eval.args.html#classes",
+    "href": "docs/api/integrations.lm_eval.args.html#classes",
+    "title": "integrations.lm_eval.args",
     "section": "",
-    "text": "integrations.spectrum.args\nModule for handling Spectrum input arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nSpectrumArgs\nInput args for Spectrum.\n\n\n\n\n\nintegrations.spectrum.args.SpectrumArgs()\nInput args for Spectrum."
+    "text": "Name\nDescription\n\n\n\n\nLMEvalArgs\nInput args for lm eval harness\n\n\n\n\n\nintegrations.lm_eval.args.LMEvalArgs()\nInput args for lm eval harness"
   },
   {
-    "objectID": "docs/api/integrations.spectrum.args.html#classes",
-    "href": "docs/api/integrations.spectrum.args.html#classes",
-    "title": "integrations.spectrum.args",
+    "objectID": "docs/api/core.chat.messages.html",
+    "href": "docs/api/core.chat.messages.html",
+    "title": "core.chat.messages",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nSpectrumArgs\nInput args for Spectrum.\n\n\n\n\n\nintegrations.spectrum.args.SpectrumArgs()\nInput args for Spectrum."
+    "text": "core.chat.messages\ninternal message representations of chat messages\n\n\n\n\n\nName\nDescription\n\n\n\n\nChatFormattedChats\nChat formatted chats with formatter and optional train on inputs\n\n\nChats\ntop level data structure for chat conversations\n\n\nMessageContentTypes\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\nMessageContents\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\nMessageRoles\nMessage roles for the system, user, assistant, and tools\n\n\nMessages\nMessages with role, content, metadata, weight, and chat formatting\n\n\nPreferenceChats\nrepresentation for preference data for chat\n\n\nSpecialToken\nSpecial tokens for beginning of string and end of string\n\n\nTool\nTool with description, function, and parameters\n\n\nToolCallContents\nTool call contents with name, arguments, and optional id\n\n\nToolCallFunction\nTool call function with name and arguments\n\n\nToolResponseContents\nTool response contents with name, content, and optional id\n\n\n\n\n\ncore.chat.messages.ChatFormattedChats()\nChat formatted chats with formatter and optional train on inputs\n\n\n\ncore.chat.messages.Chats()\ntop level data structure for chat conversations\n\n\n\ncore.chat.messages.MessageContentTypes()\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\n\ncore.chat.messages.MessageContents()\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\n\ncore.chat.messages.MessageRoles()\nMessage roles for the system, user, assistant, and tools\n\n\n\ncore.chat.messages.Messages()\nMessages with role, content, metadata, weight, and chat formatting\n\n\n\ncore.chat.messages.PreferenceChats()\nrepresentation for preference data for chat\n\n\n\ncore.chat.messages.SpecialToken()\nSpecial tokens for beginning of string and end of string\n\n\n\ncore.chat.messages.Tool()\nTool with description, function, and 
parameters\n\n\n\ncore.chat.messages.ToolCallContents()\nTool call contents with name, arguments, and optional id\n\n\n\ncore.chat.messages.ToolCallFunction()\nTool call function with name and arguments\n\n\n\ncore.chat.messages.ToolResponseContents()\nTool response contents with name, content, and optional id"
   },
   {
-    "objectID": "docs/api/integrations.cut_cross_entropy.args.html",
-    "href": "docs/api/integrations.cut_cross_entropy.args.html",
-    "title": "integrations.cut_cross_entropy.args",
+    "objectID": "docs/api/core.chat.messages.html#classes",
+    "href": "docs/api/core.chat.messages.html#classes",
+    "title": "core.chat.messages",
     "section": "",
-    "text": "integrations.cut_cross_entropy.args\nModule for handling Cut Cross Entropy input arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nCutCrossEntropyArgs\nInput args for Cut Cross Entropy.\n\n\n\n\n\nintegrations.cut_cross_entropy.args.CutCrossEntropyArgs()\nInput args for Cut Cross Entropy."
+    "text": "Name\nDescription\n\n\n\n\nChatFormattedChats\nChat formatted chats with formatter and optional train on inputs\n\n\nChats\ntop level data structure for chat conversations\n\n\nMessageContentTypes\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\nMessageContents\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\nMessageRoles\nMessage roles for the system, user, assistant, and tools\n\n\nMessages\nMessages with role, content, metadata, weight, and chat formatting\n\n\nPreferenceChats\nrepresentation for preference data for chat\n\n\nSpecialToken\nSpecial tokens for beginning of string and end of string\n\n\nTool\nTool with description, function, and parameters\n\n\nToolCallContents\nTool call contents with name, arguments, and optional id\n\n\nToolCallFunction\nTool call function with name and arguments\n\n\nToolResponseContents\nTool response contents with name, content, and optional id\n\n\n\n\n\ncore.chat.messages.ChatFormattedChats()\nChat formatted chats with formatter and optional train on inputs\n\n\n\ncore.chat.messages.Chats()\ntop level data structure for chat conversations\n\n\n\ncore.chat.messages.MessageContentTypes()\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\n\ncore.chat.messages.MessageContents()\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\n\ncore.chat.messages.MessageRoles()\nMessage roles for the system, user, assistant, and tools\n\n\n\ncore.chat.messages.Messages()\nMessages with role, content, metadata, weight, and chat formatting\n\n\n\ncore.chat.messages.PreferenceChats()\nrepresentation for preference data for chat\n\n\n\ncore.chat.messages.SpecialToken()\nSpecial tokens for beginning of string and end of string\n\n\n\ncore.chat.messages.Tool()\nTool with description, function, and parameters\n\n\n\ncore.chat.messages.ToolCallContents()\nTool call contents with name, arguments, 
and optional id\n\n\n\ncore.chat.messages.ToolCallFunction()\nTool call function with name and arguments\n\n\n\ncore.chat.messages.ToolResponseContents()\nTool response contents with name, content, and optional id"
   },
   {
-    "objectID": "docs/api/integrations.cut_cross_entropy.args.html#classes",
-    "href": "docs/api/integrations.cut_cross_entropy.args.html#classes",
-    "title": "integrations.cut_cross_entropy.args",
+    "objectID": "docs/api/prompt_strategies.dpo.zephyr.html",
+    "href": "docs/api/prompt_strategies.dpo.zephyr.html",
+    "title": "prompt_strategies.dpo.zephyr",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nCutCrossEntropyArgs\nInput args for Cut Cross Entropy.\n\n\n\n\n\nintegrations.cut_cross_entropy.args.CutCrossEntropyArgs()\nInput args for Cut Cross Entropy."
+    "text": "prompt_strategies.dpo.zephyr\nprompt_strategies.dpo.zephyr\nDPO strategies for zephyr"
   },
   {
-    "objectID": "docs/api/monkeypatch.mixtral.html",
-    "href": "docs/api/monkeypatch.mixtral.html",
-    "title": "monkeypatch.mixtral",
+    "objectID": "docs/api/core.trainers.base.html",
+    "href": "docs/api/core.trainers.base.html",
+    "title": "core.trainers.base",
     "section": "",
-    "text": "monkeypatch.mixtral\nmonkeypatch.mixtral\nPatches to support multipack for mixtral"
+    "text": "core.trainers.base\nModule for customized trainers\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlTrainer\nExtend the base Trainer for axolotl helpers\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer(\n    *_args,\n    bench_data_collator=None,\n    eval_data_collator=None,\n    dataset_tags=None,\n    **kwargs,\n)\nExtend the base Trainer for axolotl helpers\n\n\n\n\n\nName\nDescription\n\n\n\n\nlog\nLog logs on the various objects watching training, including stored metrics.\n\n\npush_to_hub\nOverwrite the push_to_hub method in order to force-add the tags when pushing the\n\n\nstore_metrics\nStore metrics with specified reduction type.\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer.log(logs, start_time=None)\nLog logs on the various objects watching training, including stored metrics.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nlogs\ndict[str, float]\nThe values to log.\nrequired\n\n\nstart_time\nfloat | None\nThe start of training.\nNone\n\n\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer.push_to_hub(*args, **kwargs)\nOverwrite the push_to_hub method in order to force-add the tags when pushing the\nmodel on the Hub. Please refer to ~transformers.Trainer.push_to_hub for more details.\n\n\n\ncore.trainers.base.AxolotlTrainer.store_metrics(\n    metrics,\n    train_eval='train',\n    reduction='mean',\n)\nStore metrics with specified reduction type.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmetrics\ndict[str, float] | dict[str, tuple[int | float, str]]\nDictionary of metric names to values, or metric names to (value, reduction_type) tuples.\nrequired\n\n\ntrain_eval\nLiteral['train', 'eval']\nWhether this is for training or evaluation.\n'train'"
   },
   {
-    "objectID": "docs/api/prompt_strategies.dpo.llama3.html",
-    "href": "docs/api/prompt_strategies.dpo.llama3.html",
-    "title": "prompt_strategies.dpo.llama3",
+    "objectID": "docs/api/core.trainers.base.html#classes",
+    "href": "docs/api/core.trainers.base.html#classes",
+    "title": "core.trainers.base",
     "section": "",
-    "text": "prompt_strategies.dpo.llama3\nDPO strategies for llama-3 chat template\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.llama3.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.llama3.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.llama3.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.llama3.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations"
+    "text": "Name\nDescription\n\n\n\n\nAxolotlTrainer\nExtend the base Trainer for axolotl helpers\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer(\n    *_args,\n    bench_data_collator=None,\n    eval_data_collator=None,\n    dataset_tags=None,\n    **kwargs,\n)\nExtend the base Trainer for axolotl helpers\n\n\n\n\n\nName\nDescription\n\n\n\n\nlog\nLog logs on the various objects watching training, including stored metrics.\n\n\npush_to_hub\nOverwrite the push_to_hub method in order to force-add the tags when pushing the\n\n\nstore_metrics\nStore metrics with specified reduction type.\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer.log(logs, start_time=None)\nLog logs on the various objects watching training, including stored metrics.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nlogs\ndict[str, float]\nThe values to log.\nrequired\n\n\nstart_time\nfloat | None\nThe start of training.\nNone\n\n\n\n\n\n\n\ncore.trainers.base.AxolotlTrainer.push_to_hub(*args, **kwargs)\nOverwrite the push_to_hub method in order to force-add the tags when pushing the\nmodel on the Hub. Please refer to ~transformers.Trainer.push_to_hub for more details.\n\n\n\ncore.trainers.base.AxolotlTrainer.store_metrics(\n    metrics,\n    train_eval='train',\n    reduction='mean',\n)\nStore metrics with specified reduction type.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmetrics\ndict[str, float] | dict[str, tuple[int | float, str]]\nDictionary of metric names to values, or metric names to (value, reduction_type) tuples.\nrequired\n\n\ntrain_eval\nLiteral['train', 'eval']\nWhether this is for training or evaluation.\n'train'"
   },
   {
-    "objectID": "docs/api/prompt_strategies.dpo.llama3.html#functions",
-    "href": "docs/api/prompt_strategies.dpo.llama3.html#functions",
-    "title": "prompt_strategies.dpo.llama3",
+    "objectID": "docs/api/utils.callbacks.qat.html",
+    "href": "docs/api/utils.callbacks.qat.html",
+    "title": "utils.callbacks.qat",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.llama3.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.llama3.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.llama3.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.llama3.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations"
+    "text": "utils.callbacks.qat\nQAT Callback for HF Causal Trainer\n\n\n\n\n\nName\nDescription\n\n\n\n\nQATCallback\nCallback to toggle fake quantization for the model.\n\n\n\n\n\nutils.callbacks.qat.QATCallback(cfg)\nCallback to toggle fake quantization for the model.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ntoggle_fake_quant\nToggle fake quantization for any fake quantized linear or embedding layers in the model.\n\n\n\n\n\nutils.callbacks.qat.toggle_fake_quant(mod, enable)\nToggle fake quantization for any fake quantized linear or embedding layers in the model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmod\nnn.Module\nThe module to toggle fake quantization for.\nrequired\n\n\nenable\nbool\nWhether to enable or disable fake quantization.\nrequired"
   },
   {
-    "objectID": "docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html",
-    "href": "docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html",
-    "title": "monkeypatch.gradient_checkpointing.offload_cpu",
+    "objectID": "docs/api/utils.callbacks.qat.html#classes",
+    "href": "docs/api/utils.callbacks.qat.html#classes",
+    "title": "utils.callbacks.qat",
     "section": "",
-    "text": "monkeypatch.gradient_checkpointing.offload_cpu\nCPU offloaded checkpointing\n\n\n\n\n\nName\nDescription\n\n\n\n\nCPU_Offloaded_Gradient_Checkpointer\nSaves VRAM by smartly offloading to RAM.\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_cpu.CPU_Offloaded_Gradient_Checkpointer(\n)\nSaves VRAM by smartly offloading to RAM.\nTiny hit to performance, since we mask the movement via non blocking calls."
+    "text": "Name\nDescription\n\n\n\n\nQATCallback\nCallback to toggle fake quantization for the model.\n\n\n\n\n\nutils.callbacks.qat.QATCallback(cfg)\nCallback to toggle fake quantization for the model."
   },
   {
-    "objectID": "docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html#classes",
-    "href": "docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html#classes",
-    "title": "monkeypatch.gradient_checkpointing.offload_cpu",
+    "objectID": "docs/api/utils.callbacks.qat.html#functions",
+    "href": "docs/api/utils.callbacks.qat.html#functions",
+    "title": "utils.callbacks.qat",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nCPU_Offloaded_Gradient_Checkpointer\nSaves VRAM by smartly offloading to RAM.\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_cpu.CPU_Offloaded_Gradient_Checkpointer(\n)\nSaves VRAM by smartly offloading to RAM.\nTiny hit to performance, since we mask the movement via non blocking calls."
+    "text": "Name\nDescription\n\n\n\n\ntoggle_fake_quant\nToggle fake quantization for any fake quantized linear or embedding layers in the model.\n\n\n\n\n\nutils.callbacks.qat.toggle_fake_quant(mod, enable)\nToggle fake quantization for any fake quantized linear or embedding layers in the model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmod\nnn.Module\nThe module to toggle fake quantization for.\nrequired\n\n\nenable\nbool\nWhether to enable or disable fake quantization.\nrequired"
   },
   {
-    "objectID": "docs/api/monkeypatch.llama_attn_hijack_xformers.html",
-    "href": "docs/api/monkeypatch.llama_attn_hijack_xformers.html",
-    "title": "monkeypatch.llama_attn_hijack_xformers",
+    "objectID": "docs/api/utils.bench.html",
+    "href": "docs/api/utils.bench.html",
+    "title": "utils.bench",
     "section": "",
-    "text": "monkeypatch.llama_attn_hijack_xformers\nmonkeypatch.llama_attn_hijack_xformers\nDirectly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments"
+    "text": "utils.bench\nBenchmarking and measurement utilities\n\n\n\n\n\nName\nDescription\n\n\n\n\ncheck_cuda_device\nwraps a function and returns the default value instead of running the\n\n\n\n\n\nutils.bench.check_cuda_device(default_value)\nwraps a function and returns the default value instead of running the\nwrapped function if cuda isn’t available or the device is auto\n:param default_value:\n:return:"
   },
   {
-    "objectID": "docs/api/utils.trainer.html",
-    "href": "docs/api/utils.trainer.html",
-    "title": "utils.trainer",
+    "objectID": "docs/api/utils.bench.html#functions",
+    "href": "docs/api/utils.bench.html#functions",
+    "title": "utils.bench",
     "section": "",
-    "text": "utils.trainer\nModule containing the Trainer class and related functions\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_pose_position_ids\nuse the PoSE technique to extend the context length by randomly skipping\n\n\nadd_position_ids\nHandle both single-example and batched data.\n\n\nfilter_sequences_by_length\nFilter sequences outside valid length range [min_sequence_len, sequence_len].\n\n\nsetup_trainer\nHelper method for instantiating and building a (causal or RLHF) trainer.\n\n\n\n\n\nutils.trainer.add_pose_position_ids(\n    sample,\n    max_context_len=32768,\n    split_on_token_ids=None,\n    chunks=2,\n)\nuse the PoSE technique to extend the context length by randomly skipping\npositions in the context. We only want to skip right before tokens in\nthe split_on_token_ids list. We should attempt to randomly distribute\nthe skips, but we don’t need the final position_ids to be the full\ncontext_len. There may be multiple turns in the context, so we want to\nmake sure we take into account the maximum possible number of skips\nremaining in each sample.\n\n\n\nutils.trainer.add_position_ids(sample)\nHandle both single-example and batched data.\n- single example: sample[‘input_ids’] is a list[int]\n- batched data: sample[‘input_ids’] is a list[list[int]]\n\n\n\nutils.trainer.filter_sequences_by_length(\n    sample,\n    sequence_len=2048,\n    min_sequence_len=2,\n    raise_on_drop=False,\n)\nFilter sequences outside valid length range [min_sequence_len, sequence_len].\nDrops samples that are either too short (&lt; min_sequence_len) or too long (&gt; sequence_len).\nWorks for both single-example (list[int]) or batched (list[list[int]]).\nIf raise_on_drop is set, the code raises a ValueError if a sample is\nencountered that is too long and would have been dropped.\n\n\n\nutils.trainer.setup_trainer(\n    cfg,\n    train_dataset,\n    eval_dataset,\n    model,\n    tokenizer,\n    processor,\n    total_num_steps,\n    model_ref=None,\n    
peft_config=None,\n)\nHelper method for instantiating and building a (causal or RLHF) trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\n\nAxolotl config object containing training parameters.\nrequired\n\n\ntrain_dataset\n\nDataset to use for training.\nrequired\n\n\neval_dataset\n\nDataset to use for evaluation.\nrequired\n\n\nmodel\n\nThe model to train.\nrequired\n\n\ntokenizer\n\nTokenizer for processing text input.\nrequired\n\n\nprocessor\n\nProcessor for data preparation.\nrequired\n\n\ntotal_num_steps\n\nThe total number of training steps.\nrequired\n\n\nmodel_ref\n\nOptional reference model for RLHF training. Default is None.\nNone\n\n\npeft_config\n\nOptional PEFT (Parameter-Efficient Fine-Tuning) configuration. Default is None.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nA trainer instance (either HFRLTrainer or HFCausalTrainer) configured based on the provided parameters."
+    "text": "Name\nDescription\n\n\n\n\ncheck_cuda_device\nwraps a function and returns the default value instead of running the\n\n\n\n\n\nutils.bench.check_cuda_device(default_value)\nwraps a function and returns the default value instead of running the\nwrapped function if cuda isn’t available or the device is auto\n:param default_value:\n:return:"
   },
   {
-    "objectID": "docs/api/utils.trainer.html#functions",
-    "href": "docs/api/utils.trainer.html#functions",
-    "title": "utils.trainer",
+    "objectID": "docs/api/utils.optimizers.adopt.html",
+    "href": "docs/api/utils.optimizers.adopt.html",
+    "title": "utils.optimizers.adopt",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nadd_pose_position_ids\nuse the PoSE technique to extend the context length by randomly skipping\n\n\nadd_position_ids\nHandle both single-example and batched data.\n\n\nfilter_sequences_by_length\nFilter sequences outside valid length range [min_sequence_len, sequence_len].\n\n\nsetup_trainer\nHelper method for instantiating and building a (causal or RLHF) trainer.\n\n\n\n\n\nutils.trainer.add_pose_position_ids(\n    sample,\n    max_context_len=32768,\n    split_on_token_ids=None,\n    chunks=2,\n)\nuse the PoSE technique to extend the context length by randomly skipping\npositions in the context. We only want to skip right before tokens in\nthe split_on_token_ids list. We should attempt to randomly distribute\nthe skips, but we don’t need the final position_ids to be the full\ncontext_len. There may be multiple turns in the context, so we want to\nmake sure we take into account the maximum possible number of skips\nremaining in each sample.\n\n\n\nutils.trainer.add_position_ids(sample)\nHandle both single-example and batched data.\n- single example: sample[‘input_ids’] is a list[int]\n- batched data: sample[‘input_ids’] is a list[list[int]]\n\n\n\nutils.trainer.filter_sequences_by_length(\n    sample,\n    sequence_len=2048,\n    min_sequence_len=2,\n    raise_on_drop=False,\n)\nFilter sequences outside valid length range [min_sequence_len, sequence_len].\nDrops samples that are either too short (&lt; min_sequence_len) or too long (&gt; sequence_len).\nWorks for both single-example (list[int]) or batched (list[list[int]]).\nIf raise_on_drop is set, the code raises a ValueError if a sample is\nencountered that is too long and would have been dropped.\n\n\n\nutils.trainer.setup_trainer(\n    cfg,\n    train_dataset,\n    eval_dataset,\n    model,\n    tokenizer,\n    processor,\n    total_num_steps,\n    model_ref=None,\n    peft_config=None,\n)\nHelper method for instantiating and building a (causal or RLHF) 
trainer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\n\nAxolotl config object containing training parameters.\nrequired\n\n\ntrain_dataset\n\nDataset to use for training.\nrequired\n\n\neval_dataset\n\nDataset to use for evaluation.\nrequired\n\n\nmodel\n\nThe model to train.\nrequired\n\n\ntokenizer\n\nTokenizer for processing text input.\nrequired\n\n\nprocessor\n\nProcessor for data preparation.\nrequired\n\n\ntotal_num_steps\n\nThe total number of training steps.\nrequired\n\n\nmodel_ref\n\nOptional reference model for RLHF training. Default is None.\nNone\n\n\npeft_config\n\nOptional PEFT (Parameter-Efficient Fine-Tuning) configuration. Default is None.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nA trainer instance (either HFRLTrainer or HFCausalTrainer) configured based on the provided parameters."
+    "text": "utils.optimizers.adopt\nCopied from https://github.com/iShohei220/adopt\nADOPT: Modified Adam Can Converge with Any β2 with the Optimal Rate (2024)\nTaniguchi, Shohei and Harada, Keno and Minegishi, Gouki and Oshima, Yuta and Jeong, Seong Cheol and Nagahara, Go and Iiyama, Tomoshi and Suzuki, Masahiro and Iwasawa, Yusuke and Matsuo, Yutaka\n\n\n\n\n\nName\nDescription\n\n\n\n\nadopt\nFunctional API that performs ADOPT algorithm computation.\n\n\n\n\n\nutils.optimizers.adopt.adopt(\n    params,\n    grads,\n    exp_avgs,\n    exp_avg_sqs,\n    state_steps,\n    foreach=None,\n    capturable=False,\n    differentiable=False,\n    fused=None,\n    grad_scale=None,\n    found_inf=None,\n    has_complex=False,\n    *,\n    beta1,\n    beta2,\n    lr,\n    clip_lambda,\n    weight_decay,\n    decouple,\n    eps,\n    maximize,\n)\nFunctional API that performs ADOPT algorithm computation."
   },
   {
-    "objectID": "docs/api/monkeypatch.btlm_attn_hijack_flash.html",
-    "href": "docs/api/monkeypatch.btlm_attn_hijack_flash.html",
-    "title": "monkeypatch.btlm_attn_hijack_flash",
+    "objectID": "docs/api/utils.optimizers.adopt.html#functions",
+    "href": "docs/api/utils.optimizers.adopt.html#functions",
+    "title": "utils.optimizers.adopt",
     "section": "",
-    "text": "monkeypatch.btlm_attn_hijack_flash\nmonkeypatch.btlm_attn_hijack_flash\nFlash attention monkey patch for cerebras btlm model"
+    "text": "Name\nDescription\n\n\n\n\nadopt\nFunctional API that performs ADOPT algorithm computation.\n\n\n\n\n\nutils.optimizers.adopt.adopt(\n    params,\n    grads,\n    exp_avgs,\n    exp_avg_sqs,\n    state_steps,\n    foreach=None,\n    capturable=False,\n    differentiable=False,\n    fused=None,\n    grad_scale=None,\n    found_inf=None,\n    has_complex=False,\n    *,\n    beta1,\n    beta2,\n    lr,\n    clip_lambda,\n    weight_decay,\n    decouple,\n    eps,\n    maximize,\n)\nFunctional API that performs ADOPT algorithm computation."
   },
   {
-    "objectID": "docs/api/core.builders.causal.html",
-    "href": "docs/api/core.builders.causal.html",
-    "title": "core.builders.causal",
+    "objectID": "docs/api/prompt_strategies.bradley_terry.llama3.html",
+    "href": "docs/api/prompt_strategies.bradley_terry.llama3.html",
+    "title": "prompt_strategies.bradley_terry.llama3",
     "section": "",
-    "text": "core.builders.causal\nBuilder for causal trainers\n\n\n\n\n\nName\nDescription\n\n\n\n\nHFCausalTrainerBuilder\nBuild the HuggingFace training args/trainer for causal models and reward modeling\n\n\n\n\n\ncore.builders.causal.HFCausalTrainerBuilder(\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nBuild the HuggingFace training args/trainer for causal models and reward modeling\nusing TRL."
+    "text": "prompt_strategies.bradley_terry.llama3\nchatml transforms for datasets with system, input, chosen, rejected to match llama3 chat template\n\n\n\n\n\nName\nDescription\n\n\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\n\n\n\nprompt_strategies.bradley_terry.llama3.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs"
   },
   {
-    "objectID": "docs/api/core.builders.causal.html#classes",
-    "href": "docs/api/core.builders.causal.html#classes",
-    "title": "core.builders.causal",
+    "objectID": "docs/api/prompt_strategies.bradley_terry.llama3.html#functions",
+    "href": "docs/api/prompt_strategies.bradley_terry.llama3.html#functions",
+    "title": "prompt_strategies.bradley_terry.llama3",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nHFCausalTrainerBuilder\nBuild the HuggingFace training args/trainer for causal models and reward modeling\n\n\n\n\n\ncore.builders.causal.HFCausalTrainerBuilder(\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nBuild the HuggingFace training args/trainer for causal models and reward modeling\nusing TRL."
+    "text": "Name\nDescription\n\n\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\n\n\n\nprompt_strategies.bradley_terry.llama3.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs"
   },
   {
-    "objectID": "docs/api/cli.cloud.base.html",
-    "href": "docs/api/cli.cloud.base.html",
-    "title": "cli.cloud.base",
+    "objectID": "docs/api/prompt_strategies.pygmalion.html",
+    "href": "docs/api/prompt_strategies.pygmalion.html",
+    "title": "prompt_strategies.pygmalion",
     "section": "",
-    "text": "cli.cloud.base\nbase class for cloud platforms from cli\n\n\n\n\n\nName\nDescription\n\n\n\n\nCloud\nAbstract base class for cloud platforms.\n\n\n\n\n\ncli.cloud.base.Cloud()\nAbstract base class for cloud platforms."
+    "text": "prompt_strategies.pygmalion\nModule containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class\n\n\n\n\n\nName\nDescription\n\n\n\n\nPygmalionPromptTokenizingStrategy\nTokenizing strategy for Pygmalion.\n\n\nPygmalionPrompter\nPrompter for Pygmalion.\n\n\n\n\n\nprompt_strategies.pygmalion.PygmalionPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    *args,\n    **kwargs,\n)\nTokenizing strategy for Pygmalion.\n\n\n\nprompt_strategies.pygmalion.PygmalionPrompter(*args, **kwargs)\nPrompter for Pygmalion."
   },
   {
-    "objectID": "docs/api/cli.cloud.base.html#classes",
-    "href": "docs/api/cli.cloud.base.html#classes",
-    "title": "cli.cloud.base",
+    "objectID": "docs/api/prompt_strategies.pygmalion.html#classes",
+    "href": "docs/api/prompt_strategies.pygmalion.html#classes",
+    "title": "prompt_strategies.pygmalion",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nCloud\nAbstract base class for cloud platforms.\n\n\n\n\n\ncli.cloud.base.Cloud()\nAbstract base class for cloud platforms."
+    "text": "Name\nDescription\n\n\n\n\nPygmalionPromptTokenizingStrategy\nTokenizing strategy for Pygmalion.\n\n\nPygmalionPrompter\nPrompter for Pygmalion.\n\n\n\n\n\nprompt_strategies.pygmalion.PygmalionPromptTokenizingStrategy(\n    prompter,\n    tokenizer,\n    *args,\n    **kwargs,\n)\nTokenizing strategy for Pygmalion.\n\n\n\nprompt_strategies.pygmalion.PygmalionPrompter(*args, **kwargs)\nPrompter for Pygmalion."
   },
   {
-    "objectID": "docs/api/core.trainers.mixins.rng_state_loader.html",
-    "href": "docs/api/core.trainers.mixins.rng_state_loader.html",
-    "title": "core.trainers.mixins.rng_state_loader",
+    "objectID": "docs/api/utils.schemas.model.html",
+    "href": "docs/api/utils.schemas.model.html",
+    "title": "utils.schemas.model",
     "section": "",
-    "text": "core.trainers.mixins.rng_state_loader\nTemporary fix/override for bug in resume from checkpoint\nSee https://github.com/huggingface/transformers/pull/37162\nTODO: Remove when upstream added PR to release\n\n\n\n\n\nName\nDescription\n\n\n\n\nRngLoaderMixin\nmixin for method override to load RNG states from a checkpoint\n\n\n\n\n\ncore.trainers.mixins.rng_state_loader.RngLoaderMixin()\nmixin for method override to load RNG states from a checkpoint"
+    "text": "utils.schemas.model\nPydantic models for model input / output, etc. configuration\n\n\n\n\n\nName\nDescription\n\n\n\n\nModelInputConfig\nModel configuration subset\n\n\nModelOutputConfig\nmodel save configuration subset\n\n\nSpecialTokensConfig\nSpecial tokens configuration subset\n\n\n\n\n\nutils.schemas.model.ModelInputConfig()\nModel configuration subset\n\n\n\nutils.schemas.model.ModelOutputConfig()\nmodel save configuration subset\n\n\n\nutils.schemas.model.SpecialTokensConfig()\nSpecial tokens configuration subset"
   },
   {
-    "objectID": "docs/api/core.trainers.mixins.rng_state_loader.html#classes",
-    "href": "docs/api/core.trainers.mixins.rng_state_loader.html#classes",
-    "title": "core.trainers.mixins.rng_state_loader",
+    "objectID": "docs/api/utils.schemas.model.html#classes",
+    "href": "docs/api/utils.schemas.model.html#classes",
+    "title": "utils.schemas.model",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nRngLoaderMixin\nmixin for method override to load RNG states from a checkpoint\n\n\n\n\n\ncore.trainers.mixins.rng_state_loader.RngLoaderMixin()\nmixin for method override to load RNG states from a checkpoint"
+    "text": "Name\nDescription\n\n\n\n\nModelInputConfig\nModel configuration subset\n\n\nModelOutputConfig\nmodel save configuration subset\n\n\nSpecialTokensConfig\nSpecial tokens configuration subset\n\n\n\n\n\nutils.schemas.model.ModelInputConfig()\nModel configuration subset\n\n\n\nutils.schemas.model.ModelOutputConfig()\nmodel save configuration subset\n\n\n\nutils.schemas.model.SpecialTokensConfig()\nSpecial tokens configuration subset"
   },
   {
-    "objectID": "docs/api/index.html",
-    "href": "docs/api/index.html",
-    "title": "API Reference",
+    "objectID": "docs/api/monkeypatch.lora_kernels.html",
+    "href": "docs/api/monkeypatch.lora_kernels.html",
+    "title": "monkeypatch.lora_kernels",
     "section": "",
-    "text": "Core functionality for training\n\n\n\ntrain\nPrepare and train a model on a dataset. Can also infer from a model or merge lora\n\n\nevaluate\nModule for evaluating models.\n\n\ndatasets\nModule containing dataset functionality.\n\n\nconvert\nModule containing File Reader, File Writer, Json Parser, and Jsonl Serializer classes\n\n\nprompt_tokenizers\nModule containing PromptTokenizingStrategy and Prompter classes\n\n\nlogging_config\nCommon logging module for axolotl.\n\n\ncore.builders.base\nBase class for trainer builder\n\n\ncore.builders.causal\nBuilder for causal trainers\n\n\ncore.builders.rl\nBuilder for RLHF trainers\n\n\ncore.training_args\nextra axolotl specific training args\n\n\ncore.chat.messages\ninternal message representations of chat messages\n\n\ncore.chat.format.chatml\nChatML transformation functions for MessageContents\n\n\ncore.chat.format.llama3x\nLlama 3.x chat formatting functions for MessageContents\n\n\ncore.chat.format.shared\nshared functions for format transforms\n\n\ncore.datasets.chat\nchat dataset module\n\n\ncore.datasets.transforms.chat_builder\nThis module contains a function that builds a transform that takes a row from the\n\n\n\n\n\n\nCommand-line interface\n\n\n\ncli.main\nClick CLI definitions for various axolotl commands.\n\n\ncli.train\nCLI to run training on a model.\n\n\ncli.evaluate\nCLI to run evaluation on a model.\n\n\ncli.args\nModule for axolotl CLI command arguments.\n\n\ncli.art\nAxolotl ASCII logo utils.\n\n\ncli.checks\nVarious checks for Axolotl CLI.\n\n\ncli.config\nConfiguration loading and processing.\n\n\ncli.delinearize_llama4\nCLI tool to delinearize quantized/Linearized Llama-4 models.\n\n\ncli.inference\nCLI to run inference on a trained model.\n\n\ncli.merge_lora\nCLI to merge a trained LoRA into a base model.\n\n\ncli.merge_sharded_fsdp_weights\nCLI to merge sharded FSDP model checkpoints into a single combined checkpoint.\n\n\ncli.preprocess\nCLI to run preprocessing of a 
dataset.\n\n\ncli.quantize\nCLI to post-training quantize a model using torchao\n\n\ncli.vllm_serve\nCLI to start the vllm server for online RL\n\n\ncli.cloud.base\nbase class for cloud platforms from cli\n\n\ncli.cloud.modal_\nModal Cloud support from CLI\n\n\ncli.utils\nInit for axolotl.cli.utils module.\n\n\ncli.utils.args\nUtilities for axolotl CLI args.\n\n\ncli.utils.fetch\nUtilities for axolotl fetch CLI command.\n\n\ncli.utils.load\nUtilities for model, tokenizer, etc. loading.\n\n\ncli.utils.sweeps\nUtilities for handling sweeps over configs for axolotl train CLI command\n\n\ncli.utils.train\nUtilities for axolotl train CLI command.\n\n\n\n\n\n\nTraining implementations\n\n\n\ncore.trainers.base\nModule for customized trainers\n\n\ncore.trainers.trl\nModule for TRL RL trainers\n\n\ncore.trainers.mamba\nModule for mamba trainer\n\n\ncore.trainers.dpo.trainer\nDPO trainer for axolotl\n\n\ncore.trainers.grpo.trainer\nAxolotl GRPO trainers (with and without sequence parallelism handling)\n\n\ncore.trainers.grpo.sampler\nRepeat random sampler (similar to the one implemented in\n\n\ncore.trainers.utils\nUtils for Axolotl trainers\n\n\n\n\n\n\nFunctionality for loading and patching models, tokenizers, etc.\n\n\n\nloaders.model\nModel loader class implementation for loading, configuring, and patching various models.\n\n\nloaders.tokenizer\nTokenizer loading functionality and associated utils\n\n\nloaders.processor\nProcessor loading functionality for multi-modal models\n\n\nloaders.adapter\nAdapter loading functionality, including LoRA / QLoRA and associated utils\n\n\nloaders.patch_manager\nPatch manager class implementation to complement axolotl.loaders.ModelLoader.\n\n\nloaders.constants\nShared constants for axolotl.loaders module\n\n\n\n\n\n\nMixin classes for augmenting trainers\n\n\n\ncore.trainers.mixins.optimizer\nModule for Axolotl trainer optimizer mixin\n\n\ncore.trainers.mixins.rng_state_loader\nTemporary fix/override for bug in resume from 
checkpoint\n\n\ncore.trainers.mixins.scheduler\nModule for Axolotl trainer scheduler mixin\n\n\n\n\n\n\nContext managers for altering trainer behaviors\n\n\n\nutils.ctx_managers.sequence_parallel\nModule for Axolotl trainer sequence parallelism manager and utilities\n\n\n\n\n\n\nPrompt formatting strategies\n\n\n\nprompt_strategies.base\nmodule for base dataset transform strategies\n\n\nprompt_strategies.chat_template\nHF Chat Templates prompt strategy\n\n\nprompt_strategies.alpaca_chat\nModule for Alpaca prompt strategy classes\n\n\nprompt_strategies.alpaca_instruct\nModule loading the AlpacaInstructPromptTokenizingStrategy class\n\n\nprompt_strategies.alpaca_w_system\nPrompt strategies loader for alpaca instruction datasets with system prompts\n\n\nprompt_strategies.user_defined\nUser Defined prompts with configuration from the YML config\n\n\nprompt_strategies.llama2_chat\nPrompt Strategy for finetuning Llama2 chat models\n\n\nprompt_strategies.completion\nBasic completion text\n\n\nprompt_strategies.input_output\nModule for plain input/output prompt pairs\n\n\nprompt_strategies.stepwise_supervised\nModule for stepwise datasets, typically including a prompt and reasoning traces,\n\n\nprompt_strategies.metharme\nModule containing the MetharmenPromptTokenizingStrategy and MetharmePrompter class\n\n\nprompt_strategies.orcamini\nPrompt Strategy for finetuning Orca Mini (v2) models\n\n\nprompt_strategies.pygmalion\nModule containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class\n\n\nprompt_strategies.messages.chat\nChat dataset wrapping strategy for new internal messages representations\n\n\nprompt_strategies.dpo.chat_template\nDPO prompt strategies for using tokenizer chat templates.\n\n\nprompt_strategies.dpo.llama3\nDPO strategies for llama-3 chat template\n\n\nprompt_strategies.dpo.chatml\nDPO strategies for chatml\n\n\nprompt_strategies.dpo.zephyr\nDPO strategies for zephyr\n\n\nprompt_strategies.dpo.user_defined\nUser-defined DPO 
strategies\n\n\nprompt_strategies.dpo.passthrough\nDPO prompt strategies passthrough/zero-processing strategy\n\n\nprompt_strategies.kto.llama3\nKTO strategies for llama-3 chat template\n\n\nprompt_strategies.kto.chatml\nKTO strategies for chatml\n\n\nprompt_strategies.kto.user_defined\nUser-defined KTO strategies\n\n\nprompt_strategies.orpo.chat_template\nchatml prompt tokenization strategy for ORPO\n\n\nprompt_strategies.bradley_terry.llama3\nchatml transforms for datasets with system, input, chosen, rejected to match llama3 chat template\n\n\n\n\n\n\nLow-level performance optimizations\n\n\n\nkernels.lora\nModule for definition of Low-Rank Adaptation (LoRA) Triton kernels.\n\n\nkernels.geglu\nModule for definition of GEGLU Triton kernels.\n\n\nkernels.swiglu\nModule for definition of SwiGLU Triton kernels.\n\n\nkernels.quantize\nDequantization utilities for bitsandbytes and FP8 integration.\n\n\nkernels.utils\nUtilities for axolotl.kernels submodules.\n\n\n\n\n\n\nRuntime patches for model optimizations\n\n\n\nmonkeypatch.llama_attn_hijack_flash\nFlash attention monkey patch for llama model\n\n\nmonkeypatch.llama_attn_hijack_xformers\nDirectly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments\n\n\nmonkeypatch.mistral_attn_hijack_flash\nFlash attention monkey patch for mistral model\n\n\nmonkeypatch.multipack\nmultipack patching for v2 of sample packing\n\n\nmonkeypatch.relora\nImplements the ReLoRA training procedure from https://arxiv.org/abs/2307.05695, minus the initial full fine-tune.\n\n\nmonkeypatch.lora_kernels\nModule for patching custom LoRA Triton kernels and torch.autograd functions.\n\n\nmonkeypatch.utils\nShared utils for the monkeypatches\n\n\nmonkeypatch.btlm_attn_hijack_flash\nFlash attention monkey patch for cerebras btlm model\n\n\nmonkeypatch.stablelm_attn_hijack_flash\nPyTorch StableLM Epoch model.\n\n\nmonkeypatch.trainer_fsdp_optim\nfix for FSDP 
optimizer save in trainer w 4.47.0\n\n\nmonkeypatch.transformers_fa_utils\nsee https://github.com/huggingface/transformers/pull/35834\n\n\nmonkeypatch.unsloth_\nmodule for patching with unsloth optimizations\n\n\nmonkeypatch.data.batch_dataset_fetcher\nMonkey patches for the dataset fetcher to handle batches of packed indexes.\n\n\nmonkeypatch.mixtral\nPatches to support multipack for mixtral\n\n\nmonkeypatch.gradient_checkpointing.offload_cpu\nCPU offloaded checkpointing\n\n\nmonkeypatch.gradient_checkpointing.offload_disk\nDISCO - DIsk-based Storage and Checkpointing with Optimized prefetching\n\n\n\n\n\n\nUtility functions\n\n\n\nutils.tokenization\nModule for tokenization utilities\n\n\nutils.chat_templates\nThis module provides functionality for selecting chat templates based on user choices.\n\n\nutils.lora\nmodule to get the state dict of a merged lora model\n\n\nutils.model_shard_quant\nmodule to handle loading model on cpu/meta device for FSDP\n\n\nutils.bench\nBenchmarking and measurement utilities\n\n\nutils.freeze\nmodule to freeze/unfreeze parameters by name\n\n\nutils.trainer\nModule containing the Trainer class and related functions\n\n\nutils.schedulers\nModule for custom LRScheduler class\n\n\nutils.distributed\nUtilities for distributed functionality.\n\n\nutils.dict\nModule containing the DictDefault class\n\n\nutils.optimizers.adopt\nCopied from https://github.com/iShohei220/adopt\n\n\nutils.data.streaming\nData handling specific to streaming datasets.\n\n\nutils.data.sft\nData handling specific to SFT.\n\n\nutils.quantization\nUtilities for quantization including QAT and PTQ using torchao.\n\n\n\n\n\n\nPydantic data models for Axolotl config\n\n\n\nutils.schemas.config\nModule with Pydantic models for configuration.\n\n\nutils.schemas.model\nPydantic models for model input / output, etc. 
configuration\n\n\nutils.schemas.training\nPydantic models for training hyperparameters\n\n\nutils.schemas.datasets\nPydantic models for datasets-related configuration\n\n\nutils.schemas.peft\nPydantic models for PEFT-related configuration\n\n\nutils.schemas.trl\nPydantic models for TRL trainer configuration\n\n\nutils.schemas.multimodal\nPydantic models for multimodal-related configuration\n\n\nutils.schemas.integrations\nPydantic models for Axolotl integrations\n\n\nutils.schemas.enums\nEnums for Axolotl input config\n\n\nutils.schemas.utils\nUtilities for Axolotl Pydantic models\n\n\n\n\n\n\nThird-party integrations and extensions\n\n\n\nintegrations.base\nBase class for all plugins.\n\n\nintegrations.cut_cross_entropy.args\nModule for handling Cut Cross Entropy input arguments.\n\n\nintegrations.grokfast.optimizer\n\n\n\nintegrations.kd.trainer\nKD trainer\n\n\nintegrations.liger.args\nModule for handling LIGER input arguments.\n\n\nintegrations.lm_eval.args\nModule for handling lm eval harness input arguments.\n\n\nintegrations.spectrum.args\nModule for handling Spectrum input arguments.\n\n\n\n\n\n\nCommon utilities and shared functionality\n\n\n\ncommon.architectures\nCommon architecture specific constants\n\n\ncommon.const\nVarious shared constants\n\n\ncommon.datasets\nDataset loading utilities.\n\n\n\n\n\n\nCustom model implementations\n\n\n\nmodels.mamba.modeling_mamba\n\n\n\n\n\n\n\nData processing utilities\n\n\n\nutils.collators.core\nbasic shared collator constants\n\n\nutils.collators.batching\nData collators for axolotl to pad labels and position_ids for packed sequences\n\n\nutils.collators.mamba\ncollators for Mamba\n\n\nutils.collators.mm_chat\nCollators for multi-modal chat messages and packing\n\n\nutils.samplers.multipack\nMultipack Batch Sampler - An efficient batch sampler for packing variable-length sequences\n\n\n\n\n\n\nTraining callbacks\n\n\n\nutils.callbacks.perplexity\ncallback to calculate perplexity as an evaluation 
metric.\n\n\nutils.callbacks.profiler\nHF Trainer callback for creating pytorch profiling snapshots\n\n\nutils.callbacks.lisa\nmodule for LISA\n\n\nutils.callbacks.mlflow_\nMLFlow module for trainer callbacks\n\n\nutils.callbacks.comet_\nComet module for trainer callbacks\n\n\nutils.callbacks.qat\nQAT Callback for HF Causal Trainer"
+    "text": "monkeypatch.lora_kernels\nModule for patching custom LoRA Triton kernels and torch.autograd functions.\n\n\n\n\n\nName\nDescription\n\n\n\n\nFakeMLP\nplaceholder MLP for triton patching\n\n\n\n\n\nmonkeypatch.lora_kernels.FakeMLP(gate_proj, up_proj, down_proj)\nplaceholder MLP for triton patching\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\napply_lora_kernel_patches\nApplies optimized Triton kernel patches to a PEFT model.\n\n\nget_attention_cls_from_config\nGet the appropriate attention class by inspecting the model config.\n\n\nget_layers\nGet the layers of the model. Handles text-only and multimodal models.\n\n\noriginal_apply_o\nOriginal implementation of output projection without optimizations.\n\n\noriginal_apply_qkv\nOriginal implementation of QKV projection without optimizations.\n\n\noriginal_apply_qkv_optional_v\nQKV projection for models where v_proj may be None (e.g. Gemma4 attention_k_eq_v).\n\n\npatch_self_attn_lora\nGiven an axolotl config, this method patches the inferred attention class forward\n\n\n\n\n\nmonkeypatch.lora_kernels.apply_lora_kernel_patches(model, cfg)\nApplies optimized Triton kernel patches to a PEFT model.\nPatches a PEFT model with optimized implementations for MLP and attention\ncomputations. 
The optimizations include custom Triton kernels for activation\nfunctions and specialized autograd functions for LoRA computations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\nPeftModelForCausalLM\nA PEFT model to be patched with optimized kernels.\nrequired\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nPeftModelForCausalLM\nPeftModelForCausalLM\nThe patched model with optimized kernels.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTypeError\nIf the provided model is not a PeftModelForCausalLM.\n\n\n\nNotImplementedError\nIf the model type is not supported.\n\n\n\nAssertionError\nIf multiple adapters are active (currently unsupported).\n\n\n\n\n\n\nThe optimizations require LoRA adapters with no dropout and no bias terms. The\nfunction will skip patching if these conditions aren’t met.\n\n\n\n\nmonkeypatch.lora_kernels.get_attention_cls_from_config(cfg)\nGet the appropriate attention class by inspecting the model config.\nUses dynamic import to support any model architecture that follows\nthe standard transformers naming convention.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nType[nn.Module]\nThe appropriate attention class for the model.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf base_model not specified or attention class cannot be imported\n\n\n\nImportError\nIf the model module or attention class doesn’t exist\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.get_layers(model)\nGet the layers of the model. 
Handles text-only and multimodal models.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\nPeftModelForCausalLM\nA PEFT model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[nn.Module]\nA list of layers.\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.original_apply_o(self, hidden_states)\nOriginal implementation of output projection without optimizations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nself\nnn.Module\nThe attention module instance.\nrequired\n\n\nhidden_states\ntorch.Tensor\nInput tensor of shape [batch_size, seq_len, hidden_dim]`.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nThe output projection result.\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.original_apply_qkv(self, hidden_states)\nOriginal implementation of QKV projection without optimizations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nself\nnn.Module\nThe attention module instance.\nrequired\n\n\nhidden_states\ntorch.Tensor\nInput tensor of shape [batch_size, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nA tuple (query_states, key_states, value_states) containing the projected states for query, key, and value.\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.original_apply_qkv_optional_v(self, hidden_states)\nQKV projection for models where v_proj may be None (e.g. Gemma4 attention_k_eq_v).\nWhen v_proj is None, key_states are reused as value_states.\n\n\n\nmonkeypatch.lora_kernels.patch_self_attn_lora(cfg)\nGiven an axolotl config, this method patches the inferred attention class forward\npass with optimized LoRA implementations.\nIt modifies the attention class to use optimized QKV and output projections. 
The\noriginal implementation is preserved and can be restored if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nAssertionError\nIf the required code blocks are not found in the attention implementation."
   },
   {
-    "objectID": "docs/api/index.html#core",
-    "href": "docs/api/index.html#core",
-    "title": "API Reference",
+    "objectID": "docs/api/monkeypatch.lora_kernels.html#classes",
+    "href": "docs/api/monkeypatch.lora_kernels.html#classes",
+    "title": "monkeypatch.lora_kernels",
     "section": "",
-    "text": "Core functionality for training\n\n\n\ntrain\nPrepare and train a model on a dataset. Can also infer from a model or merge lora\n\n\nevaluate\nModule for evaluating models.\n\n\ndatasets\nModule containing dataset functionality.\n\n\nconvert\nModule containing File Reader, File Writer, Json Parser, and Jsonl Serializer classes\n\n\nprompt_tokenizers\nModule containing PromptTokenizingStrategy and Prompter classes\n\n\nlogging_config\nCommon logging module for axolotl.\n\n\ncore.builders.base\nBase class for trainer builder\n\n\ncore.builders.causal\nBuilder for causal trainers\n\n\ncore.builders.rl\nBuilder for RLHF trainers\n\n\ncore.training_args\nextra axolotl specific training args\n\n\ncore.chat.messages\ninternal message representations of chat messages\n\n\ncore.chat.format.chatml\nChatML transformation functions for MessageContents\n\n\ncore.chat.format.llama3x\nLlama 3.x chat formatting functions for MessageContents\n\n\ncore.chat.format.shared\nshared functions for format transforms\n\n\ncore.datasets.chat\nchat dataset module\n\n\ncore.datasets.transforms.chat_builder\nThis module contains a function that builds a transform that takes a row from the"
+    "text": "Name\nDescription\n\n\n\n\nFakeMLP\nplaceholder MLP for triton patching\n\n\n\n\n\nmonkeypatch.lora_kernels.FakeMLP(gate_proj, up_proj, down_proj)\nplaceholder MLP for triton patching"
   },
   {
-    "objectID": "docs/api/index.html#cli",
-    "href": "docs/api/index.html#cli",
-    "title": "API Reference",
+    "objectID": "docs/api/monkeypatch.lora_kernels.html#functions",
+    "href": "docs/api/monkeypatch.lora_kernels.html#functions",
+    "title": "monkeypatch.lora_kernels",
     "section": "",
-    "text": "Command-line interface\n\n\n\ncli.main\nClick CLI definitions for various axolotl commands.\n\n\ncli.train\nCLI to run training on a model.\n\n\ncli.evaluate\nCLI to run evaluation on a model.\n\n\ncli.args\nModule for axolotl CLI command arguments.\n\n\ncli.art\nAxolotl ASCII logo utils.\n\n\ncli.checks\nVarious checks for Axolotl CLI.\n\n\ncli.config\nConfiguration loading and processing.\n\n\ncli.delinearize_llama4\nCLI tool to delinearize quantized/Linearized Llama-4 models.\n\n\ncli.inference\nCLI to run inference on a trained model.\n\n\ncli.merge_lora\nCLI to merge a trained LoRA into a base model.\n\n\ncli.merge_sharded_fsdp_weights\nCLI to merge sharded FSDP model checkpoints into a single combined checkpoint.\n\n\ncli.preprocess\nCLI to run preprocessing of a dataset.\n\n\ncli.quantize\nCLI to post-training quantize a model using torchao\n\n\ncli.vllm_serve\nCLI to start the vllm server for online RL\n\n\ncli.cloud.base\nbase class for cloud platforms from cli\n\n\ncli.cloud.modal_\nModal Cloud support from CLI\n\n\ncli.utils\nInit for axolotl.cli.utils module.\n\n\ncli.utils.args\nUtilities for axolotl CLI args.\n\n\ncli.utils.fetch\nUtilities for axolotl fetch CLI command.\n\n\ncli.utils.load\nUtilities for model, tokenizer, etc. loading.\n\n\ncli.utils.sweeps\nUtilities for handling sweeps over configs for axolotl train CLI command\n\n\ncli.utils.train\nUtilities for axolotl train CLI command."
+    "text": "Name\nDescription\n\n\n\n\napply_lora_kernel_patches\nApplies optimized Triton kernel patches to a PEFT model.\n\n\nget_attention_cls_from_config\nGet the appropriate attention class by inspecting the model config.\n\n\nget_layers\nGet the layers of the model. Handles text-only and multimodal models.\n\n\noriginal_apply_o\nOriginal implementation of output projection without optimizations.\n\n\noriginal_apply_qkv\nOriginal implementation of QKV projection without optimizations.\n\n\noriginal_apply_qkv_optional_v\nQKV projection for models where v_proj may be None (e.g. Gemma4 attention_k_eq_v).\n\n\npatch_self_attn_lora\nGiven an axolotl config, this method patches the inferred attention class forward\n\n\n\n\n\nmonkeypatch.lora_kernels.apply_lora_kernel_patches(model, cfg)\nApplies optimized Triton kernel patches to a PEFT model.\nPatches a PEFT model with optimized implementations for MLP and attention\ncomputations. The optimizations include custom Triton kernels for activation\nfunctions and specialized autograd functions for LoRA computations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\nPeftModelForCausalLM\nA PEFT model to be patched with optimized kernels.\nrequired\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nPeftModelForCausalLM\nPeftModelForCausalLM\nThe patched model with optimized kernels.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nTypeError\nIf the provided model is not a PeftModelForCausalLM.\n\n\n\nNotImplementedError\nIf the model type is not supported.\n\n\n\nAssertionError\nIf multiple adapters are active (currently unsupported).\n\n\n\n\n\n\nThe optimizations require LoRA adapters with no dropout and no bias terms. 
The\nfunction will skip patching if these conditions aren’t met.\n\n\n\n\nmonkeypatch.lora_kernels.get_attention_cls_from_config(cfg)\nGet the appropriate attention class by inspecting the model config.\nUses dynamic import to support any model architecture that follows\nthe standard transformers naming convention.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nType[nn.Module]\nThe appropriate attention class for the model.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf base_model not specified or attention class cannot be imported\n\n\n\nImportError\nIf the model module or attention class doesn’t exist\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.get_layers(model)\nGet the layers of the model. Handles text-only and multimodal models.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodel\nPeftModelForCausalLM\nA PEFT model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[nn.Module]\nA list of layers.\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.original_apply_o(self, hidden_states)\nOriginal implementation of output projection without optimizations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nself\nnn.Module\nThe attention module instance.\nrequired\n\n\nhidden_states\ntorch.Tensor\nInput tensor of shape [batch_size, seq_len, hidden_dim]`.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nThe output projection result.\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.original_apply_qkv(self, hidden_states)\nOriginal implementation of QKV projection without optimizations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nself\nnn.Module\nThe attention module instance.\nrequired\n\n\nhidden_states\ntorch.Tensor\nInput tensor of shape [batch_size, seq_len, 
hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nA tuple (query_states, key_states, value_states) containing the projected states for query, key, and value.\n\n\n\n\n\n\n\nmonkeypatch.lora_kernels.original_apply_qkv_optional_v(self, hidden_states)\nQKV projection for models where v_proj may be None (e.g. Gemma4 attention_k_eq_v).\nWhen v_proj is None, key_states are reused as value_states.\n\n\n\nmonkeypatch.lora_kernels.patch_self_attn_lora(cfg)\nGiven an axolotl config, this method patches the inferred attention class forward\npass with optimized LoRA implementations.\nIt modifies the attention class to use optimized QKV and output projections. The\noriginal implementation is preserved and can be restored if needed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nAssertionError\nIf the required code blocks are not found in the attention implementation."
   },
   {
-    "objectID": "docs/api/index.html#trainers",
-    "href": "docs/api/index.html#trainers",
-    "title": "API Reference",
+    "objectID": "docs/api/utils.schemas.utils.html",
+    "href": "docs/api/utils.schemas.utils.html",
+    "title": "utils.schemas.utils",
     "section": "",
-    "text": "Training implementations\n\n\n\ncore.trainers.base\nModule for customized trainers\n\n\ncore.trainers.trl\nModule for TRL RL trainers\n\n\ncore.trainers.mamba\nModule for mamba trainer\n\n\ncore.trainers.dpo.trainer\nDPO trainer for axolotl\n\n\ncore.trainers.grpo.trainer\nAxolotl GRPO trainers (with and without sequence parallelism handling)\n\n\ncore.trainers.grpo.sampler\nRepeat random sampler (similar to the one implemented in\n\n\ncore.trainers.utils\nUtils for Axolotl trainers"
+    "text": "utils.schemas.utils\nUtilities for Axolotl Pydantic models\n\n\n\n\n\nName\nDescription\n\n\n\n\nhandle_legacy_message_fields_logic\nHandle backwards compatibility between legacy message field mapping and new property mapping system.\n\n\n\n\n\nutils.schemas.utils.handle_legacy_message_fields_logic(data)\nHandle backwards compatibility between legacy message field mapping and new property mapping system.\nPreviously, the config only supported mapping ‘role’ and ‘content’ fields via dedicated config options:\n- message_field_role: Mapped to the role field\n- message_field_content: Mapped to the content field\nThe new system uses message_property_mappings to support arbitrary field mappings:\nmessage_property_mappings:\nrole: source_role_field\ncontent: source_content_field\nadditional_field: source_field\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\ndict\nDictionary containing configuration data\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ndict\nUpdated dictionary with message field mappings consolidated\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf there are conflicts between legacy and new mappings"
   },
   {
-    "objectID": "docs/api/index.html#model-loading",
-    "href": "docs/api/index.html#model-loading",
-    "title": "API Reference",
+    "objectID": "docs/api/utils.schemas.utils.html#functions",
+    "href": "docs/api/utils.schemas.utils.html#functions",
+    "title": "utils.schemas.utils",
     "section": "",
-    "text": "Functionality for loading and patching models, tokenizers, etc.\n\n\n\nloaders.model\nModel loader class implementation for loading, configuring, and patching various models.\n\n\nloaders.tokenizer\nTokenizer loading functionality and associated utils\n\n\nloaders.processor\nProcessor loading functionality for multi-modal models\n\n\nloaders.adapter\nAdapter loading functionality, including LoRA / QLoRA and associated utils\n\n\nloaders.patch_manager\nPatch manager class implementation to complement axolotl.loaders.ModelLoader.\n\n\nloaders.constants\nShared constants for axolotl.loaders module"
+    "text": "Name\nDescription\n\n\n\n\nhandle_legacy_message_fields_logic\nHandle backwards compatibility between legacy message field mapping and new property mapping system.\n\n\n\n\n\nutils.schemas.utils.handle_legacy_message_fields_logic(data)\nHandle backwards compatibility between legacy message field mapping and new property mapping system.\nPreviously, the config only supported mapping ‘role’ and ‘content’ fields via dedicated config options:\n- message_field_role: Mapped to the role field\n- message_field_content: Mapped to the content field\nThe new system uses message_property_mappings to support arbitrary field mappings:\nmessage_property_mappings:\nrole: source_role_field\ncontent: source_content_field\nadditional_field: source_field\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\ndict\nDictionary containing configuration data\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ndict\nUpdated dictionary with message field mappings consolidated\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf there are conflicts between legacy and new mappings"
   },
   {
-    "objectID": "docs/api/index.html#mixins",
-    "href": "docs/api/index.html#mixins",
-    "title": "API Reference",
+    "objectID": "docs/api/cli.checks.html",
+    "href": "docs/api/cli.checks.html",
+    "title": "cli.checks",
     "section": "",
-    "text": "Mixin classes for augmenting trainers\n\n\n\ncore.trainers.mixins.optimizer\nModule for Axolotl trainer optimizer mixin\n\n\ncore.trainers.mixins.rng_state_loader\nTemporary fix/override for bug in resume from checkpoint\n\n\ncore.trainers.mixins.scheduler\nModule for Axolotl trainer scheduler mixin"
+    "text": "cli.checks\nVarious checks for Axolotl CLI.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncheck_accelerate_default_config\nLogs at warning level if no accelerate config file is found.\n\n\ncheck_user_token\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\ncli.checks.check_accelerate_default_config()\nLogs at warning level if no accelerate config file is found.\n\n\n\ncli.checks.check_user_token()\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nBoolean indicating successful check (i.e., HF_HUB_OFFLINE=1 or HF user info is retrieved).\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLocalTokenNotFoundError\nIf HF user info can’t be retrieved."
   },
   {
-    "objectID": "docs/api/index.html#context-managers",
-    "href": "docs/api/index.html#context-managers",
-    "title": "API Reference",
+    "objectID": "docs/api/cli.checks.html#functions",
+    "href": "docs/api/cli.checks.html#functions",
+    "title": "cli.checks",
     "section": "",
-    "text": "Context managers for altering trainer behaviors\n\n\n\nutils.ctx_managers.sequence_parallel\nModule for Axolotl trainer sequence parallelism manager and utilities"
+    "text": "Name\nDescription\n\n\n\n\ncheck_accelerate_default_config\nLogs at warning level if no accelerate config file is found.\n\n\ncheck_user_token\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\ncli.checks.check_accelerate_default_config()\nLogs at warning level if no accelerate config file is found.\n\n\n\ncli.checks.check_user_token()\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nBoolean indicating successful check (i.e., HF_HUB_OFFLINE=1 or HF user info is retrieved).\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLocalTokenNotFoundError\nIf HF user info can’t be retrieved."
   },
   {
-    "objectID": "docs/api/index.html#prompt-strategies",
-    "href": "docs/api/index.html#prompt-strategies",
-    "title": "API Reference",
+    "objectID": "docs/api/monkeypatch.gradient_checkpointing.offload_disk.html",
+    "href": "docs/api/monkeypatch.gradient_checkpointing.offload_disk.html",
+    "title": "monkeypatch.gradient_checkpointing.offload_disk",
     "section": "",
-    "text": "Prompt formatting strategies\n\n\n\nprompt_strategies.base\nmodule for base dataset transform strategies\n\n\nprompt_strategies.chat_template\nHF Chat Templates prompt strategy\n\n\nprompt_strategies.alpaca_chat\nModule for Alpaca prompt strategy classes\n\n\nprompt_strategies.alpaca_instruct\nModule loading the AlpacaInstructPromptTokenizingStrategy class\n\n\nprompt_strategies.alpaca_w_system\nPrompt strategies loader for alpaca instruction datasets with system prompts\n\n\nprompt_strategies.user_defined\nUser Defined prompts with configuration from the YML config\n\n\nprompt_strategies.llama2_chat\nPrompt Strategy for finetuning Llama2 chat models\n\n\nprompt_strategies.completion\nBasic completion text\n\n\nprompt_strategies.input_output\nModule for plain input/output prompt pairs\n\n\nprompt_strategies.stepwise_supervised\nModule for stepwise datasets, typically including a prompt and reasoning traces,\n\n\nprompt_strategies.metharme\nModule containing the MetharmenPromptTokenizingStrategy and MetharmePrompter class\n\n\nprompt_strategies.orcamini\nPrompt Strategy for finetuning Orca Mini (v2) models\n\n\nprompt_strategies.pygmalion\nModule containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class\n\n\nprompt_strategies.messages.chat\nChat dataset wrapping strategy for new internal messages representations\n\n\nprompt_strategies.dpo.chat_template\nDPO prompt strategies for using tokenizer chat templates.\n\n\nprompt_strategies.dpo.llama3\nDPO strategies for llama-3 chat template\n\n\nprompt_strategies.dpo.chatml\nDPO strategies for chatml\n\n\nprompt_strategies.dpo.zephyr\nDPO strategies for zephyr\n\n\nprompt_strategies.dpo.user_defined\nUser-defined DPO strategies\n\n\nprompt_strategies.dpo.passthrough\nDPO prompt strategies passthrough/zero-processing strategy\n\n\nprompt_strategies.kto.llama3\nKTO strategies for llama-3 chat template\n\n\nprompt_strategies.kto.chatml\nKTO strategies for 
chatml\n\n\nprompt_strategies.kto.user_defined\nUser-defined KTO strategies\n\n\nprompt_strategies.orpo.chat_template\nchatml prompt tokenization strategy for ORPO\n\n\nprompt_strategies.bradley_terry.llama3\nchatml transforms for datasets with system, input, chosen, rejected to match llama3 chat template"
+    "text": "monkeypatch.gradient_checkpointing.offload_disk\nDISCO - DIsk-based Storage and Checkpointing with Optimized prefetching\n\n\n\n\n\nName\nDescription\n\n\n\n\nDisco\nDisco: DIsk-based Storage and Checkpointing with Optimized prefetching\n\n\nDiskOffloadManager\nManages offloaded tensors and handles prefetching in a separate thread.\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.Disco()\nDisco: DIsk-based Storage and Checkpointing with Optimized prefetching\nAdvanced disk-based gradient checkpointer with prefetching.\n\n\n\n\n\nName\nDescription\n\n\n\n\nbackward\nBackward pass that loads activations from disk with prefetching\n\n\nforward\nForward pass that offloads activations to disk asynchronously\n\n\nget_instance\nGet or create the offload manager\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.Disco.backward(\n    ctx,\n    *grad_outputs,\n)\nBackward pass that loads activations from disk with prefetching\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.Disco.forward(\n    ctx,\n    forward_function,\n    hidden_states,\n    *args,\n    prefetch_size=1,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nForward pass that offloads activations to disk asynchronously\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.Disco.get_instance(\n    prefetch_size=1,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nGet or create the offload manager\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager(\n    prefetch_size=3,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nManages offloaded tensors and handles prefetching in a separate thread.\nIncludes synchronization to prevent race conditions.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncleanup\nClean up all temp files and stop prefetch thread with proper synchronization\n\n\ncleanup_tensor\nClean up a specific tensor file after it’s been used\n\n\nload_tensor\nLoad tensor from disk or prefetch cache with proper 
synchronization\n\n\nsave_tensor\nSave tensor to disk asynchronously and return file path with thread-safe operations\n\n\ntrigger_prefetch\nTrigger prefetching of the next N tensors with proper synchronization\n\n\nwait_for_save\nWait for a tensor to be saved to disk\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup()\nClean up all temp files and stop prefetch thread with proper synchronization\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup_tensor(\n    file_path,\n)\nClean up a specific tensor file after it’s been used\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.load_tensor(\n    file_path,\n    target_device='cuda',\n)\nLoad tensor from disk or prefetch cache with proper synchronization\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.save_tensor(\n    tensor,\n)\nSave tensor to disk asynchronously and return file path with thread-safe operations\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.trigger_prefetch(\n    n=None,\n)\nTrigger prefetching of the next N tensors with proper synchronization\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.wait_for_save(\n    file_path,\n    timeout=None,\n)\nWait for a tensor to be saved to disk"
   },
   {
-    "objectID": "docs/api/index.html#kernels",
-    "href": "docs/api/index.html#kernels",
-    "title": "API Reference",
+    "objectID": "docs/api/monkeypatch.gradient_checkpointing.offload_disk.html#classes",
+    "href": "docs/api/monkeypatch.gradient_checkpointing.offload_disk.html#classes",
+    "title": "monkeypatch.gradient_checkpointing.offload_disk",
     "section": "",
-    "text": "Low-level performance optimizations\n\n\n\nkernels.lora\nModule for definition of Low-Rank Adaptation (LoRA) Triton kernels.\n\n\nkernels.geglu\nModule for definition of GEGLU Triton kernels.\n\n\nkernels.swiglu\nModule for definition of SwiGLU Triton kernels.\n\n\nkernels.quantize\nDequantization utilities for bitsandbytes and FP8 integration.\n\n\nkernels.utils\nUtilities for axolotl.kernels submodules."
+    "text": "Name\nDescription\n\n\n\n\nDisco\nDisco: DIsk-based Storage and Checkpointing with Optimized prefetching\n\n\nDiskOffloadManager\nManages offloaded tensors and handles prefetching in a separate thread.\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.Disco()\nDisco: DIsk-based Storage and Checkpointing with Optimized prefetching\nAdvanced disk-based gradient checkpointer with prefetching.\n\n\n\n\n\nName\nDescription\n\n\n\n\nbackward\nBackward pass that loads activations from disk with prefetching\n\n\nforward\nForward pass that offloads activations to disk asynchronously\n\n\nget_instance\nGet or create the offload manager\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.Disco.backward(\n    ctx,\n    *grad_outputs,\n)\nBackward pass that loads activations from disk with prefetching\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.Disco.forward(\n    ctx,\n    forward_function,\n    hidden_states,\n    *args,\n    prefetch_size=1,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nForward pass that offloads activations to disk asynchronously\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.Disco.get_instance(\n    prefetch_size=1,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nGet or create the offload manager\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager(\n    prefetch_size=3,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nManages offloaded tensors and handles prefetching in a separate thread.\nIncludes synchronization to prevent race conditions.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncleanup\nClean up all temp files and stop prefetch thread with proper synchronization\n\n\ncleanup_tensor\nClean up a specific tensor file after it’s been used\n\n\nload_tensor\nLoad tensor from disk or prefetch cache with proper synchronization\n\n\nsave_tensor\nSave tensor to disk asynchronously and return file path with thread-safe operations\n\n\ntrigger_prefetch\nTrigger prefetching of the 
next N tensors with proper synchronization\n\n\nwait_for_save\nWait for a tensor to be saved to disk\n\n\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup()\nClean up all temp files and stop prefetch thread with proper synchronization\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup_tensor(\n    file_path,\n)\nClean up a specific tensor file after it’s been used\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.load_tensor(\n    file_path,\n    target_device='cuda',\n)\nLoad tensor from disk or prefetch cache with proper synchronization\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.save_tensor(\n    tensor,\n)\nSave tensor to disk asynchronously and return file path with thread-safe operations\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.trigger_prefetch(\n    n=None,\n)\nTrigger prefetching of the next N tensors with proper synchronization\n\n\n\nmonkeypatch.gradient_checkpointing.offload_disk.DiskOffloadManager.wait_for_save(\n    file_path,\n    timeout=None,\n)\nWait for a tensor to be saved to disk"
   },
   {
-    "objectID": "docs/api/index.html#monkey-patches",
-    "href": "docs/api/index.html#monkey-patches",
-    "title": "API Reference",
+    "objectID": "docs/api/prompt_strategies.dpo.chatml.html",
+    "href": "docs/api/prompt_strategies.dpo.chatml.html",
+    "title": "prompt_strategies.dpo.chatml",
     "section": "",
-    "text": "Runtime patches for model optimizations\n\n\n\nmonkeypatch.llama_attn_hijack_flash\nFlash attention monkey patch for llama model\n\n\nmonkeypatch.llama_attn_hijack_xformers\nDirectly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments\n\n\nmonkeypatch.mistral_attn_hijack_flash\nFlash attention monkey patch for mistral model\n\n\nmonkeypatch.multipack\nmultipack patching for v2 of sample packing\n\n\nmonkeypatch.relora\nImplements the ReLoRA training procedure from https://arxiv.org/abs/2307.05695, minus the initial full fine-tune.\n\n\nmonkeypatch.lora_kernels\nModule for patching custom LoRA Triton kernels and torch.autograd functions.\n\n\nmonkeypatch.utils\nShared utils for the monkeypatches\n\n\nmonkeypatch.btlm_attn_hijack_flash\nFlash attention monkey patch for cerebras btlm model\n\n\nmonkeypatch.stablelm_attn_hijack_flash\nPyTorch StableLM Epoch model.\n\n\nmonkeypatch.trainer_fsdp_optim\nfix for FSDP optimizer save in trainer w 4.47.0\n\n\nmonkeypatch.transformers_fa_utils\nsee https://github.com/huggingface/transformers/pull/35834\n\n\nmonkeypatch.unsloth_\nmodule for patching with unsloth optimizations\n\n\nmonkeypatch.data.batch_dataset_fetcher\nMonkey patches for the dataset fetcher to handle batches of packed indexes.\n\n\nmonkeypatch.mixtral\nPatches to support multipack for mixtral\n\n\nmonkeypatch.gradient_checkpointing.offload_cpu\nCPU offloaded checkpointing\n\n\nmonkeypatch.gradient_checkpointing.offload_disk\nDISCO - DIsk-based Storage and Checkpointing with Optimized prefetching"
+    "text": "prompt_strategies.dpo.chatml\nDPO strategies for chatml\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.chatml.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.chatml.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations"
   },
   {
-    "objectID": "docs/api/index.html#utils",
-    "href": "docs/api/index.html#utils",
-    "title": "API Reference",
+    "objectID": "docs/api/prompt_strategies.dpo.chatml.html#functions",
+    "href": "docs/api/prompt_strategies.dpo.chatml.html#functions",
+    "title": "prompt_strategies.dpo.chatml",
     "section": "",
-    "text": "Utility functions\n\n\n\nutils.tokenization\nModule for tokenization utilities\n\n\nutils.chat_templates\nThis module provides functionality for selecting chat templates based on user choices.\n\n\nutils.lora\nmodule to get the state dict of a merged lora model\n\n\nutils.model_shard_quant\nmodule to handle loading model on cpu/meta device for FSDP\n\n\nutils.bench\nBenchmarking and measurement utilities\n\n\nutils.freeze\nmodule to freeze/unfreeze parameters by name\n\n\nutils.trainer\nModule containing the Trainer class and related functions\n\n\nutils.schedulers\nModule for custom LRScheduler class\n\n\nutils.distributed\nUtilities for distributed functionality.\n\n\nutils.dict\nModule containing the DictDefault class\n\n\nutils.optimizers.adopt\nCopied from https://github.com/iShohei220/adopt\n\n\nutils.data.streaming\nData handling specific to streaming datasets.\n\n\nutils.data.sft\nData handling specific to SFT.\n\n\nutils.quantization\nUtilities for quantization including QAT and PTQ using torchao."
+    "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.chatml.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.chatml.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations"
   },
   {
-    "objectID": "docs/api/index.html#schemas",
-    "href": "docs/api/index.html#schemas",
-    "title": "API Reference",
+    "objectID": "docs/api/utils.chat_templates.html",
+    "href": "docs/api/utils.chat_templates.html",
+    "title": "utils.chat_templates",
     "section": "",
-    "text": "Pydantic data models for Axolotl config\n\n\n\nutils.schemas.config\nModule with Pydantic models for configuration.\n\n\nutils.schemas.model\nPydantic models for model input / output, etc. configuration\n\n\nutils.schemas.training\nPydantic models for training hyperparameters\n\n\nutils.schemas.datasets\nPydantic models for datasets-related configuration\n\n\nutils.schemas.peft\nPydantic models for PEFT-related configuration\n\n\nutils.schemas.trl\nPydantic models for TRL trainer configuration\n\n\nutils.schemas.multimodal\nPydantic models for multimodal-related configuration\n\n\nutils.schemas.integrations\nPydantic models for Axolotl integrations\n\n\nutils.schemas.enums\nEnums for Axolotl input config\n\n\nutils.schemas.utils\nUtilities for Axolotl Pydantic models"
+    "text": "utils.chat_templates\nutils.chat_templates\nThis module provides functionality for selecting chat templates based on user choices.\nThese templates are used for formatting messages in a conversation."
   },
   {
-    "objectID": "docs/api/index.html#integrations",
-    "href": "docs/api/index.html#integrations",
-    "title": "API Reference",
+    "objectID": "docs/api/prompt_strategies.orpo.chat_template.html",
+    "href": "docs/api/prompt_strategies.orpo.chat_template.html",
+    "title": "prompt_strategies.orpo.chat_template",
     "section": "",
-    "text": "Third-party integrations and extensions\n\n\n\nintegrations.base\nBase class for all plugins.\n\n\nintegrations.cut_cross_entropy.args\nModule for handling Cut Cross Entropy input arguments.\n\n\nintegrations.grokfast.optimizer\n\n\n\nintegrations.kd.trainer\nKD trainer\n\n\nintegrations.liger.args\nModule for handling LIGER input arguments.\n\n\nintegrations.lm_eval.args\nModule for handling lm eval harness input arguments.\n\n\nintegrations.spectrum.args\nModule for handling Spectrum input arguments."
+    "text": "prompt_strategies.orpo.chat_template\nchatml prompt tokenization strategy for ORPO\n\n\n\n\n\nName\nDescription\n\n\n\n\nMessage\nmessage/turn\n\n\nMessageList\nconversation\n\n\nORPODatasetParsingStrategy\nStrategy to parse chosen rejected dataset into messagelist\n\n\nORPOPrompter\nSingle Turn prompter for ORPO\n\n\nORPOTokenizingStrategy\nrejected_ids\n\n\n\n\n\nprompt_strategies.orpo.chat_template.Message()\nmessage/turn\n\n\n\nprompt_strategies.orpo.chat_template.MessageList()\nconversation\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy()\nStrategy to parse chosen rejected dataset into messagelist\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_chosen_conversation_thread\nDataset structure mappings\n\n\nget_prompt\nMap the data to extract everything up to the last turn\n\n\nget_rejected_conversation_thread\nDataset structure mappings\n\n\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_chosen_conversation_thread(\n    prompt,\n)\nDataset structure mappings\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_prompt(\n    prompt,\n)\nMap the data to extract everything up to the last turn\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_rejected_conversation_thread(\n    prompt,\n)\nDataset structure mappings\n\n\n\n\n\nprompt_strategies.orpo.chat_template.ORPOPrompter(chat_template, tokenizer)\nSingle Turn prompter for ORPO\n\n\n\nprompt_strategies.orpo.chat_template.ORPOTokenizingStrategy(\n    *args,\n    dataset_parser=None,\n    **kwargs,\n)\nrejected_ids\ninput_ids\nrejected_attention_mask\nattention_mask\nrejected_labels\nlabels\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nload\nchatml transforms for datasets with system, input, chosen, rejected\n\n\n\n\n\nprompt_strategies.orpo.chat_template.load(tokenizer, cfg, ds_cfg=None, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected"
   },
   {
-    "objectID": "docs/api/index.html#common",
-    "href": "docs/api/index.html#common",
-    "title": "API Reference",
+    "objectID": "docs/api/prompt_strategies.orpo.chat_template.html#classes",
+    "href": "docs/api/prompt_strategies.orpo.chat_template.html#classes",
+    "title": "prompt_strategies.orpo.chat_template",
     "section": "",
-    "text": "Common utilities and shared functionality\n\n\n\ncommon.architectures\nCommon architecture specific constants\n\n\ncommon.const\nVarious shared constants\n\n\ncommon.datasets\nDataset loading utilities."
+    "text": "Name\nDescription\n\n\n\n\nMessage\nmessage/turn\n\n\nMessageList\nconversation\n\n\nORPODatasetParsingStrategy\nStrategy to parse chosen rejected dataset into messagelist\n\n\nORPOPrompter\nSingle Turn prompter for ORPO\n\n\nORPOTokenizingStrategy\nrejected_ids\n\n\n\n\n\nprompt_strategies.orpo.chat_template.Message()\nmessage/turn\n\n\n\nprompt_strategies.orpo.chat_template.MessageList()\nconversation\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy()\nStrategy to parse chosen rejected dataset into messagelist\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_chosen_conversation_thread\nDataset structure mappings\n\n\nget_prompt\nMap the data to extract everything up to the last turn\n\n\nget_rejected_conversation_thread\nDataset structure mappings\n\n\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_chosen_conversation_thread(\n    prompt,\n)\nDataset structure mappings\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_prompt(\n    prompt,\n)\nMap the data to extract everything up to the last turn\n\n\n\nprompt_strategies.orpo.chat_template.ORPODatasetParsingStrategy.get_rejected_conversation_thread(\n    prompt,\n)\nDataset structure mappings\n\n\n\n\n\nprompt_strategies.orpo.chat_template.ORPOPrompter(chat_template, tokenizer)\nSingle Turn prompter for ORPO\n\n\n\nprompt_strategies.orpo.chat_template.ORPOTokenizingStrategy(\n    *args,\n    dataset_parser=None,\n    **kwargs,\n)\nrejected_ids\ninput_ids\nrejected_attention_mask\nattention_mask\nrejected_labels\nlabels"
   },
   {
-    "objectID": "docs/api/index.html#models",
-    "href": "docs/api/index.html#models",
-    "title": "API Reference",
+    "objectID": "docs/api/prompt_strategies.orpo.chat_template.html#functions",
+    "href": "docs/api/prompt_strategies.orpo.chat_template.html#functions",
+    "title": "prompt_strategies.orpo.chat_template",
     "section": "",
-    "text": "Custom model implementations\n\n\n\nmodels.mamba.modeling_mamba"
+    "text": "Name\nDescription\n\n\n\n\nload\nchatml transforms for datasets with system, input, chosen, rejected\n\n\n\n\n\nprompt_strategies.orpo.chat_template.load(tokenizer, cfg, ds_cfg=None, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected"
   },
   {
-    "objectID": "docs/api/index.html#data-processing",
-    "href": "docs/api/index.html#data-processing",
-    "title": "API Reference",
+    "objectID": "docs/api/monkeypatch.utils.html",
+    "href": "docs/api/monkeypatch.utils.html",
+    "title": "monkeypatch.utils",
     "section": "",
-    "text": "Data processing utilities\n\n\n\nutils.collators.core\nbasic shared collator constants\n\n\nutils.collators.batching\nData collators for axolotl to pad labels and position_ids for packed sequences\n\n\nutils.collators.mamba\ncollators for Mamba\n\n\nutils.collators.mm_chat\nCollators for multi-modal chat messages and packing\n\n\nutils.samplers.multipack\nMultipack Batch Sampler - An efficient batch sampler for packing variable-length sequences"
+    "text": "monkeypatch.utils\nShared utils for the monkeypatches\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_cu_seqlens\ngenerate a cumulative sequence length mask for flash attention using attn mask\n\n\nget_cu_seqlens_from_pos_ids\ngenerate a cumulative sequence length mask for flash attention using pos ids\n\n\n\n\n\nmonkeypatch.utils.get_cu_seqlens(attn_mask)\ngenerate a cumulative sequence length mask for flash attention using attn mask\n\n\n\nmonkeypatch.utils.get_cu_seqlens_from_pos_ids(position_ids)\ngenerate a cumulative sequence length mask for flash attention using pos ids"
   },
   {
-    "objectID": "docs/api/index.html#callbacks",
-    "href": "docs/api/index.html#callbacks",
-    "title": "API Reference",
+    "objectID": "docs/api/monkeypatch.utils.html#functions",
+    "href": "docs/api/monkeypatch.utils.html#functions",
+    "title": "monkeypatch.utils",
     "section": "",
-    "text": "Training callbacks\n\n\n\nutils.callbacks.perplexity\ncallback to calculate perplexity as an evaluation metric.\n\n\nutils.callbacks.profiler\nHF Trainer callback for creating pytorch profiling snapshots\n\n\nutils.callbacks.lisa\nmodule for LISA\n\n\nutils.callbacks.mlflow_\nMLFlow module for trainer callbacks\n\n\nutils.callbacks.comet_\nComet module for trainer callbacks\n\n\nutils.callbacks.qat\nQAT Callback for HF Causal Trainer"
+    "text": "Name\nDescription\n\n\n\n\nget_cu_seqlens\ngenerate a cumulative sequence length mask for flash attention using attn mask\n\n\nget_cu_seqlens_from_pos_ids\ngenerate a cumulative sequence length mask for flash attention using pos ids\n\n\n\n\n\nmonkeypatch.utils.get_cu_seqlens(attn_mask)\ngenerate a cumulative sequence length mask for flash attention using attn mask\n\n\n\nmonkeypatch.utils.get_cu_seqlens_from_pos_ids(position_ids)\ngenerate a cumulative sequence length mask for flash attention using pos ids"
   },
   {
-    "objectID": "docs/api/prompt_strategies.base.html",
-    "href": "docs/api/prompt_strategies.base.html",
-    "title": "prompt_strategies.base",
+    "objectID": "docs/api/utils.callbacks.profiler.html",
+    "href": "docs/api/utils.callbacks.profiler.html",
+    "title": "utils.callbacks.profiler",
     "section": "",
-    "text": "prompt_strategies.base\nprompt_strategies.base\nmodule for base dataset transform strategies"
+    "text": "utils.callbacks.profiler\nHF Trainer callback for creating pytorch profiling snapshots\n\n\n\n\n\nName\nDescription\n\n\n\n\nPytorchProfilerCallback\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.\n\n\n\n\n\nutils.callbacks.profiler.PytorchProfilerCallback(\n    steps_to_profile=5,\n    profiler_steps_start=0,\n)\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.\nAlso runs torch.profiler to produce a Chrome trace for timing analysis."
   },
   {
-    "objectID": "docs/api/monkeypatch.unsloth_.html",
-    "href": "docs/api/monkeypatch.unsloth_.html",
-    "title": "monkeypatch.unsloth_",
+    "objectID": "docs/api/utils.callbacks.profiler.html#classes",
+    "href": "docs/api/utils.callbacks.profiler.html#classes",
+    "title": "utils.callbacks.profiler",
     "section": "",
-    "text": "monkeypatch.unsloth_\nmonkeypatch.unsloth_\nmodule for patching with unsloth optimizations"
+    "text": "Name\nDescription\n\n\n\n\nPytorchProfilerCallback\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.\n\n\n\n\n\nutils.callbacks.profiler.PytorchProfilerCallback(\n    steps_to_profile=5,\n    profiler_steps_start=0,\n)\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.\nAlso runs torch.profiler to produce a Chrome trace for timing analysis."
   },
   {
     "objectID": "docs/api/utils.schemas.integrations.html",
@@ -6030,7 +6023,7 @@
     "href": "docs/config-reference.html",
     "title": "Config Reference",
     "section": "",
-    "text": "# Allow overwrite yml config using from cli\nstrict: bool | None = False\n# Resume from a specific checkpoint dir\nresume_from_checkpoint: str | None\n# If resume_from_checkpoint isn't set and you simply want it to start where it left off.\n# Be careful with this being turned on between different models.\nauto_resume_from_checkpoints: bool | None\n# Resize the model embeddings when new tokens are added to multiples of 32. This is\n# reported to improve training speed on some models\nresize_token_embeddings_to_32x: bool | None\nmean_resizing_embeddings: bool | None = False\n\n# Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.\nshrink_embeddings: bool | None\n# Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs\nembeddings_skip_upcast: bool | None\n# Reinitialize model weights randomly instead of loading pretrained weights\nreinit_weights: bool | None\n\n# module to custom trainer class to use for training\ntrainer_cls: str | None\n\n# Use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo', 'ebft'\nrl: RLType | None\n\ntrl: TRLConfig | None\n  # For TRLConfig:\n  # Beta parameter for the RL training. Same as `rl_beta`. Use\n  beta: float | None\n  # Maximum length of the completion for RL training.\n  max_completion_length: int | None\n\n  # Whether to use VLLM for RL training.\n  use_vllm: bool = False\n  # VLLM mode to use, one of 'server' or 'colocate'\n  vllm_mode: Literal['server', 'colocate'] | None\n  # Host of the vLLM server to connect to.\n  vllm_server_host: str | None = 0.0.0.0\n  # Port of the vLLM server to connect to.\n  vllm_server_port: int | None = 8000\n  # Total timeout (in seconds) to wait for the vLLM server to respond.\n  vllm_server_timeout: int | None\n  # Regex for vLLM guided decoding.\n  vllm_guided_decoding_regex: str | None\n\n  # List of reward functions to load. 
Paths must be importable from current dir.\n  reward_funcs: list[str] | None\n  # List of reward weights for the reward functions.\n  reward_weights: list[float] | None\n  # Batch size for generation. Controls how many unique prompts are generated per step.\n  # Should be num_generations * data_parallel_size for full DP utilization.\n  generation_batch_size: int | None\n  # Number of generations to sample.\n  num_generations: int | None\n  # Whether to log completions.\n  log_completions: bool | None = False\n  # Number of completions to print when log_completions is True.\n  num_completions_to_print: int | None\n  # Controls whether importance sampling ratios are computed at the `'token'` or\n  # `'sequence'` level. For GSPO, use `sequence`, default is None which corresponds to\n  # the original GRPO paper.\n  importance_sampling_level: Literal['sequence', 'token'] | None\n\n  # Whether to sync the reference model.\n  sync_ref_model: bool | None = False\n  # Mixup alpha for the reference model.\n  ref_model_mixup_alpha: float | None = 0.9\n  # Sync steps for the reference model.\n  ref_model_sync_steps: int | None = 64\n  # Whether to scale rewards by their standard deviation.\n  scale_rewards: bool = True\n\n  # Sampling temperature for the GRPO policy.\n  temperature: float | None\n  # Top-p sampling probability for the generation policy.\n  top_p: float | None\n  # Top-k sampling for the generation policy.\n  top_k: int | None\n  # Minimum probability for the generation policy.\n  min_p: float | None\n  # Penalty for tokens that appear in prompt and generated text.\n  repetition_penalty: float | None\n  # Additional generation parameters passed to vLLM SamplingParams. Useful for\n  # stop_token_ids, seed, frequency_penalty, etc.\n  generation_kwargs: dict[str, Any] | None\n  # Additional kwargs for the chat template. 
E.g., {enable_thinking: false} for Qwen3.5\n  # models.\n  chat_template_kwargs: dict[str, Any] | None\n  # Number of iterations per batch (μ) for GRPO.\n  num_iterations: int | None\n  # Epsilon value for clipping in the GRPO algorithm.\n  epsilon: float | None\n  # Upper-bound epsilon value for clipping in the GRPO algorithm.\n  epsilon_high: float | None\n  # Whether to use Liger loss for GRPO.\n  use_liger_loss: bool | None\n  # Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.\n  loss_type: str | None\n  # Whether to exclude truncated completions from loss calculation.\n  mask_truncated_completions: bool = False\n  # Enable sleep mode for vLLM to offload VRAM when idle\n  vllm_enable_sleep_mode: bool | None\n  # Path to custom rollout function. Must be importable from current dir.\n  rollout_func: str | None\n  # Multi-objective reward aggregation strategy. 'sum_then_normalize' (GRPO default):\n  # weights and sums rewards first, then normalizes. 'normalize_then_sum' (GDPO):\n  # normalizes each reward independently, then sums.\n  multi_objective_aggregation: Literal['sum_then_normalize', 'normalize_then_sum'] | None\n\n  # Use the GRPODataProducer protocol for online data generation.\n  use_data_producer: bool = False\n  # Generate rollouts in a background thread while training on the previous rollout.\n  async_prefetch: bool = False\n  # Number of rollouts to prefetch ahead of training.\n  prefetch_depth: int | None\n  # Sync model weights to vLLM every N optimizer steps (async mode only).\n  vllm_sync_interval: int | None\n  # Score prompt groups incrementally instead of the full batch at once.\n  streaming_partial_batch: bool | None\n  # Minimum prompt groups to score per streaming chunk.\n  streaming_min_groups: int | None\n  # Apply IS correction for distribution mismatch between vLLM and training model.\n  vllm_importance_sampling_correction: bool | None\n  # IS mode: token_truncate, token_mask, sequence_truncate, or sequence_mask.\n  
vllm_importance_sampling_mode: Literal['token_truncate', 'token_mask', 'sequence_truncate', 'sequence_mask'] | None\n  # Cap C for IS ratio clipping/masking.\n  vllm_importance_sampling_cap: float | None\n  # KL threshold for off-policy sequence masking (OPSM). None = disabled.\n  off_policy_mask_threshold: float | None\n  # Apply IS correction to KL divergence term.\n  use_bias_correction_kl: bool | None\n\n  # Number of persistent subprocess workers for parallel reward computation. Each worker\n  # has its own main thread so signal.alarm() (used by math_verify) works correctly.\n  # Work is sharded across workers by prompt groups. Only used with\n  # use_data_producer=True and non-nn.Module reward functions.\n  reward_num_workers: int = 1\n  # [Experimental, disabled by default] Size of the replay buffer for storing high-\n  # signal rollout groups. When &gt; 0, groups with reward variance are cached and used to\n  # replace zero-signal groups (where all rewards are identical). Set to 0 to disable.\n  # Only used with use_data_producer=True.\n  replay_buffer_size: int = 0\n  # When True (default), recompute old_per_token_logps for replayed groups using the\n  # current training model. This fixes the importance sampling mismatch that occurs when\n  # replaying stale data. Only relevant when replay_buffer_size &gt; 0.\n  replay_recompute_logps: bool = True\n  # Fraction of total training steps after which deferred re-rolling begins. Zero-signal\n  # prompts (where all rewards in a group are identical) are buffered and re-injected\n  # into later batches when the model is more likely to solve them. Set to 1.0 to\n  # disable. Only used with use_data_producer=True.\n  reroll_start_fraction: float = 1.0\n  # Maximum number of prompt groups to replace with re-roll candidates per batch. Higher\n  # values increase data utilization but reduce prompt diversity. 
Only used with\n  # use_data_producer=True.\n  reroll_max_groups: int = 1\n  # When True, skip gradient computation for micro-batches where all advantages are zero\n  # (no learning signal). This avoids the forward/backward pass entirely when no\n  # learning signal is present. The step is logged with skipped_zero_adv_batches=1 for\n  # monitoring.\n  skip_zero_advantage_batches: bool = True\n  # Sync LoRA adapter to vLLM via filesystem instead of merging + NCCL broadcast. Auto-\n  # selects vllm_serve_lora serve module. Syncs only LoRA adapter weights vs full merged\n  # model.\n  vllm_lora_sync: bool = False\n\nvllm: VllmConfig | None\n  # For VllmConfig:\n  # Device to use for VLLM\n  device: str | None = auto\n  # Tensor parallel size for VLLM\n  tensor_parallel_size: int | None\n  # Data parallel size for VLLM\n  data_parallel_size: int | None\n  # GPU memory utilization for VLLM\n  gpu_memory_utilization: float | None = 0.9\n  # Data type for VLLM\n  dtype: str | None = auto\n  # Maximum length of the model context for VLLM\n  max_model_len: int | None\n  # Enable prefix caching for VLLM\n  enable_prefix_caching: bool | None\n  # Host for the vLLM server to start on\n  host: str | None = 0.0.0.0\n  # Port of the vLLM server to start on\n  port: int | None = 8000\n\n  # Enable reasoning for VLLM\n  enable_reasoning: bool | None\n  # Reasoning parser for VLLM\n  reasoning_parser: str | None\n  # Disable CUDA graph capture in vLLM. Required for models with causal_conv1d (e.g.,\n  # Qwen3.5 hybrid linear attention).\n  enforce_eager: bool | None\n  # Python module for vLLM serve script. Set to 'axolotl.scripts.vllm_serve_lora' for\n  # native LoRA support, or leave None for default TRL serve.\n  serve_module: str | None\n  # vLLM worker extension class for weight synchronization. 
Defaults to\n  # 'trl.scripts.vllm_serve.WeightSyncWorkerExtension'.\n  worker_extension_cls: str | None\n\n# Configuration for Energy-Based Fine-Tuning (EBFT)\nebft: EBFTConfig | None\n  # For EBFTConfig:\n  # Fractional layer depths for feature extraction (e.g., [0.25, 0.5, 0.75])\n  feature_layers: list[float] = [0.25, 0.5, 0.75]\n  # Embedding method: 'last_token', 'mean_pooling', 'completion_mean', or 'concat'\n  embed_method: Literal['last_token', 'mean_pooling', 'completion_mean', 'concat'] = last_token\n  # Apply SVD whitening to feature embeddings\n  use_whitening: bool = False\n  # Coefficient for alignment reward (cosine similarity with ground truth)\n  alignment_coef: float = 1.0\n  # Coefficient for diversity penalty (pairwise similarity between samples)\n  diversity_coef: float = 1.0\n  # Cross-entropy loss coefficient on ground-truth tokens\n  ce_coef: float = 0.0\n  # Set per-batch max_tokens based on ground-truth length\n  adaptive_max_tokens: bool = True\n  # Multiplier for ground-truth token count when computing adaptive max_tokens\n  gt_length_multiplier: float = 1.5\n\n  # EBFT mode: 'structured' (QA with vLLM) or 'strided' (unstructured text)\n  mode: Literal['structured', 'strided'] = structured\n  # Stride between anchor points (tokens)\n  stride: int = 8\n  # Context window size per block\n  context_length: int = 8\n  # Tokens to generate per block\n  generate_max_len: int = 8\n  # Independent rollouts per document\n  n_samples_per_prompt: int = 4\n  # Sampling temperature for strided generation\n  temperature: float = 0.6\n  # Top-p nucleus sampling threshold\n  top_p: float = 1.0\n  # RL policy gradient loss coefficient\n  rl_coef: float = 1.0\n  # Advantage estimator: 'rloo', 'group_norm', 'reinforce'\n  advantage_estimator: Literal['rloo', 'group_norm', 'reinforce'] = rloo\n  # Minimum tokens into completion before placing anchors. 
Skips anchors too close to\n  # the prompt boundary where features are dominated by prompt context.\n  min_completion_prefix: int = 0\n\nqat: QATConfig | None\n  # For QATConfig:\n  # Fake quantization layout to use for activation quantization.\n  activation_dtype: TorchAOQuantDType | None\n  # Fake quantization layout to use for weight quantization.\n  weight_dtype: TorchAOQuantDType = TorchAOQuantDType.int8\n  # Quantize embedding\n  quantize_embedding: bool | None = False\n  # The number of elements in each group for per-group fake quantization\n  group_size: int | None = 32\n  # The number of steps to apply fake quantization after\n  fake_quant_after_n_steps: int | None\n\nquantization: PTQConfig | None\n  # For PTQConfig:\n  # Fake quantization layout to use for weight quantization.\n  weight_dtype: TorchAOQuantDType = TorchAOQuantDType.int8\n  # Fake quantization layout to use for activation quantization.\n  activation_dtype: TorchAOQuantDType | None\n  # Whether to quantize the embedding layer.\n  quantize_embedding: bool | None\n  # The number of elements in each group for per-group fake quantization\n  group_size: int | None = 32\n\n# Reward modelling: `True` or `False`\nreward_model: bool | None\n\n# Configuration for dynamic checkpointing (trigger by file or signal). Set 'enabled:\n# true' to activate this feature.\ndynamic_checkpoint: DynamicCheckpointConfig | None\n  # For DynamicCheckpointConfig:\n  # Enable dynamic checkpoint triggering during training. Create a file\n  # 'axolotl_checkpoint.save' in the configured `output_dir` to trigger.\n  enabled: bool = False\n  # Check for trigger file every N steps (reduces I/O overhead). Default: 100\n  check_interval: int = 10\n  # Custom trigger filename (optional). If not specified, defaults to\n  # 'axolotl_checkpoint.save'. 
Specify a filename (not a full path) to override the\n  # default.\n  trigger_file_path: str = \n\n# Process reward modelling: `True` or `False`\nprocess_reward_model: bool | None\n# Coefficient to incentivize the reward model to output mean-zero rewards (proposed by\n# https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`.\ncenter_rewards_coefficient: float | None\nnum_labels: int | None\n\n# Whether to perform weighting in DPO trainer\ndpo_use_weighting: bool | None\ndpo_label_smoothing: float | None\n# Precompute reference model log probabilities for DPO\nprecompute_ref_log_probs: bool | None\n\n# Whether to use Liger kernel for DPO loss.\ndpo_use_liger_kernel: bool | None\n\ndpo_padding_free: bool | None\n\n# A list of one or more datasets to finetune the model with\ndatasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset | SyntheticDataset], MinLen(1)] | None\n  # For SFTDataset:\n  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n  path: str | None\n  # name of dataset split to load from\n  split: str | None\n  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n  type: str | UserDefinedPrompterType | None\n    # For UserDefinedPrompterType:\n    # Custom user instruction prompt\n    system_prompt: str | None\n    # Use {system} as key to be replaced\n    system_format: str | None\n    field_system: str | None\n    field_instruction: str | None\n    field_input: str | None\n    field_output: str | None\n\n    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n    # be replaced. 
'format' can include {input}\n    format: str | None\n    # 'no_input_format' cannot include {input}\n    no_input_format: str | None\n  input_transform: str | None\n  # split dataset into N pieces (use with shards_idx)\n  shards: int | None\n  # the index of sharded dataset to use\n  shards_idx: int | None\n  # process dataset in N sequential chunks for memory efficiency (exclusive with\n  # `shards`)\n  preprocess_shards: int | None\n  conversation: str | None\n\n  # The name of the chat template to use for training, following values are supported:\n  # tokenizer_default: Uses the chat template that is available in the\n  # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n  # will raise an error. This is the default.\n  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n  # template. The custom jinja template should be provided in the chat_template_jinja\n  # field.\n  chat_template: ChatTemplate | str | None\n  # Custom jinja chat template or path to jinja file. 
Used only if `chat_template:\n  # jinja` or empty.\n  chat_template_jinja: str | None\n  # path to source data files\n  data_files: str | list[str] | None\n  input_format: str | None\n  # name of dataset configuration to load\n  name: str | None\n  # defines the datatype when path is a file\n  ds_type: str | None\n  # For `completion` datasets only, uses the provided field instead of `text` column\n  field: str | None\n  field_human: str | None\n  field_model: str | None\n  # Key containing the messages (default: \"messages\")\n  field_messages: str | None\n  # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n  # schema](https://json-schema.org/learn/getting-started-step-by-step).\n  field_tools: str | None\n  # Key containing the reasoning trace (default: \"reasoning_content\").\n  field_thinking: str | None\n  # The key the chat template expects that indicates the reasoning trace.\n  template_thinking_key: str | None\n\n  message_field_role: str | None\n\n  message_field_content: str | None\n  # Mapping of properties from the input dataset to the chat template. (default:\n  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n  # in the template but not in this mapping, the system will attempt to load it directly\n  # from the message using the property name as the key. Example: In the mapping below,\n  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n  # used as 'content' in the chat template.\n  message_property_mappings: dict[str, str] | None\n  # The key in the message turn that indicates via boolean whether tokens of a turn\n  # should be considered for training. Useful to selectively train on certain turns\n  # besides the `roles_to_train`.\n  message_field_training: str | None\n  # The key in the message turn that contains the training details. Useful to\n  # selectively train on certain tokens in a turn. 
The value of the key is a List[Dict]\n  # containing `begin_offset` (start character index in content), `end_offset` (end\n  # character index in content), and `train` (boolean whether to train).\n  message_field_training_detail: str | None\n  # (for Qwen3 template only) Whether to split the assistant content based on a\n  # reasoning trace inside delimited tags\n  split_thinking: bool | None\n  logprobs_field: str | None\n  temperature: float | None\n  # Roles to train on. The tokens from these roles will be considered for the loss.\n  roles_to_train: list[str] | None\n  # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n  # turn, last: train on the last EOS token in the conversation\n  train_on_eos: Literal['all', 'turn', 'last'] | None\n  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n  # source roles will be mapped to the target role. The default is: user: [\"human\",\n  # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n  roles: dict[str, list[str]] | None\n  # Whether to drop the system turn from the dataset. Only works with chat_template.\n  # This does not drop the default system message from chat_template if it exists. If\n  # you wish to, we recommend using a custom jinja template with the default system\n  # message removed or adding a system turn with empty content.\n  drop_system_message: bool | None\n  # Trust remote code for untrusted source\n  trust_remote_code: bool | None = False\n  # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n  # This can be a commit hash, tag, or branch name. If not specified, the latest version\n  # will be used. 
This parameter is ignored for local datasets.\n  revision: str | None\n\n  # For DPODataset:\n  path: str | None\n  split: str | None\n  type: UserDefinedDPOType | str | None\n    # For UserDefinedDPOType:\n    field_system: str | None\n    field_prompt: str | None\n    field_chosen: str | None\n    field_rejected: str | None\n    prompt_format: str | None\n    chosen_format: str | None\n    rejected_format: str | None\n  data_files: list[str] | None\n  revision: str | None\n  field_messages: str | None\n\n  # For KTODataset:\n  path: str | None\n  split: str | None\n  type: UserDefinedKTOType | str | None\n    # For UserDefinedKTOType:\n    field_system: str | None\n    field_prompt: str | None\n    field_completion: str | None\n    field_label: bool | None\n    prompt_format: str | None\n    completion_format: str | None\n  data_files: list[str] | None\n  trust_remote_code: bool | None = False\n  revision: str | None\n\n  # For StepwiseSupervisedDataset:\n  path: str | None\n  split: str | None\n  data_files: list[str] | None\n  revision: str | None\n  step_separator: str | None\n  max_completion_length: int | None\n  train_on_last_step_only: bool | None\n\n  # For SyntheticDataset:\n  path: Literal['synthetic'] = synthetic\n  type: Literal['_synthetic'] = _synthetic\n  # Number of rows to generate\n  length: int = 1000\n  # Sequence length per row (defaults to sequence_len from config)\n  sequence_length: int | None\n  # Minimum token ID for generation\n  min_input_id: int = 100\n  # Maximum token ID for generation (defaults to tokenizer vocab_size)\n  max_input_id: int | None\n  # Random seed for reproducibility\n  seed: int | None\n\n# A list of one or more datasets to eval the model with. 
You can use either\n# test_datasets, or val_set_size, but not both.\ntest_datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset | SyntheticDataset], MinLen(1)] | None\n  # For SFTDataset:\n  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n  path: str | None\n  # name of dataset split to load from\n  split: str | None\n  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n  type: str | UserDefinedPrompterType | None\n    # For UserDefinedPrompterType:\n    # Custom user instruction prompt\n    system_prompt: str | None\n    # Use {system} as key to be replaced\n    system_format: str | None\n    field_system: str | None\n    field_instruction: str | None\n    field_input: str | None\n    field_output: str | None\n\n    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n    # be replaced. 'format' can include {input}\n    format: str | None\n    # 'no_input_format' cannot include {input}\n    no_input_format: str | None\n  input_transform: str | None\n  # split dataset into N pieces (use with shards_idx)\n  shards: int | None\n  # the index of sharded dataset to use\n  shards_idx: int | None\n  # process dataset in N sequential chunks for memory efficiency (exclusive with\n  # `shards`)\n  preprocess_shards: int | None\n  conversation: str | None\n\n  # The name of the chat template to use for training, following values are supported:\n  # tokenizer_default: Uses the chat template that is available in the\n  # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n  # will raise an error. 
This is the default.\n  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n  # template. The custom jinja template should be provided in the chat_template_jinja\n  # field.\n  chat_template: ChatTemplate | str | None\n  # Custom jinja chat template or path to jinja file. Used only if `chat_template:\n  # jinja` or empty.\n  chat_template_jinja: str | None\n  # path to source data files\n  data_files: str | list[str] | None\n  input_format: str | None\n  # name of dataset configuration to load\n  name: str | None\n  # defines the datatype when path is a file\n  ds_type: str | None\n  # For `completion` datasets only, uses the provided field instead of `text` column\n  field: str | None\n  field_human: str | None\n  field_model: str | None\n  # Key containing the messages (default: \"messages\")\n  field_messages: str | None\n  # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n  # schema](https://json-schema.org/learn/getting-started-step-by-step).\n  field_tools: str | None\n  # Key containing the reasoning trace (default: \"reasoning_content\").\n  field_thinking: str | None\n  # The key the chat template expects that indicates the reasoning trace.\n  template_thinking_key: str | None\n\n  message_field_role: str | None\n\n  message_field_content: str | None\n  # Mapping of properties from the input dataset to the chat template. 
(default:\n  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n  # in the template but not in this mapping, the system will attempt to load it directly\n  # from the message using the property name as the key. Example: In the mapping below,\n  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n  # used as 'content' in the chat template.\n  message_property_mappings: dict[str, str] | None\n  # The key in the message turn that indicates via boolean whether tokens of a turn\n  # should be considered for training. Useful to selectively train on certain turns\n  # besides the `roles_to_train`.\n  message_field_training: str | None\n  # The key in the message turn that contains the training details. Useful to\n  # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n  # containing `begin_offset` (start character index in content), `end_offset` (end\n  # character index in content), and `train` (boolean whether to train).\n  message_field_training_detail: str | None\n  # (for Qwen3 template only) Whether to split the assistant content based on a\n  # reasoning trace inside delimited tags\n  split_thinking: bool | None\n  logprobs_field: str | None\n  temperature: float | None\n  # Roles to train on. The tokens from these roles will be considered for the loss.\n  roles_to_train: list[str] | None\n  # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n  # turn, last: train on the last EOS token in the conversation\n  train_on_eos: Literal['all', 'turn', 'last'] | None\n  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n  # source roles will be mapped to the target role. 
The default is: user: [\"human\",\n  # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n  roles: dict[str, list[str]] | None\n  # Whether to drop the system turn from the dataset. Only works with chat_template.\n  # This does not drop the default system message from chat_template if it exists. If\n  # you wish to, we recommend using a custom jinja template with the default system\n  # message removed or adding a system turn with empty content.\n  drop_system_message: bool | None\n  # Trust remote code for untrusted source\n  trust_remote_code: bool | None = False\n  # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n  # This can be a commit hash, tag, or branch name. If not specified, the latest version\n  # will be used. This parameter is ignored for local datasets.\n  revision: str | None\n\n  # For DPODataset:\n  path: str | None\n  split: str | None\n  type: UserDefinedDPOType | str | None\n    # For UserDefinedDPOType:\n    field_system: str | None\n    field_prompt: str | None\n    field_chosen: str | None\n    field_rejected: str | None\n    prompt_format: str | None\n    chosen_format: str | None\n    rejected_format: str | None\n  data_files: list[str] | None\n  revision: str | None\n  field_messages: str | None\n\n  # For KTODataset:\n  path: str | None\n  split: str | None\n  type: UserDefinedKTOType | str | None\n    # For UserDefinedKTOType:\n    field_system: str | None\n    field_prompt: str | None\n    field_completion: str | None\n    field_label: bool | None\n    prompt_format: str | None\n    completion_format: str | None\n  data_files: list[str] | None\n  trust_remote_code: bool | None = False\n  revision: str | None\n\n  # For StepwiseSupervisedDataset:\n  path: str | None\n  split: str | None\n  data_files: list[str] | None\n  revision: str | None\n  step_separator: str | None\n  max_completion_length: int | None\n  train_on_last_step_only: bool | None\n\n  # For 
SyntheticDataset:\n  path: Literal['synthetic'] = synthetic\n  type: Literal['_synthetic'] = _synthetic\n  # Number of rows to generate\n  length: int = 1000\n  # Sequence length per row (defaults to sequence_len from config)\n  sequence_length: int | None\n  # Minimum token ID for generation\n  min_input_id: int = 100\n  # Maximum token ID for generation (defaults to tokenizer vocab_size)\n  max_input_id: int | None\n  # Random seed for reproducibility\n  seed: int | None\n\n# If false, the datasets will not be shuffled and will keep their original order in\n# `datasets`. The same applies to the `test_datasets` option and the\n# `pretraining_dataset` option. Default is true.\nshuffle_merged_datasets: bool | None = True\n# If true, each dataset in `datasets` will be shuffled before merging. This allows\n# curriculum learning strategies to be applied at the dataset level. Default is false.\nshuffle_before_merging_datasets: bool | None = False\n# Axolotl attempts to save the dataset as an arrow after packing the data together so\n# subsequent training attempts load faster, relative path\ndataset_prepared_path: str | None\n# Num shards for whole dataset\ndataset_shard_num: int | None\n# Index of shard to use for whole dataset\ndataset_shard_idx: int | None\nskip_prepare_dataset: bool | None = False\n# Number of shards to save the prepared dataset\nnum_dataset_shards_to_save: int | None\n\n# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize\npretraining_dataset: Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None\n  # For PretrainingDataset:\n  name: str | None\n  path: str | None\n  split: str | None = train\n  text_column: str | None = text\n  type: str | None = pretrain\n  trust_remote_code: bool | None = False\n  data_files: str | None\n  skip: int | None\n\n  # For SFTDataset:\n  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n  path: str | None\n  # name of dataset split to load from\n  
split: str | None\n  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n  type: str | UserDefinedPrompterType | None\n    # For UserDefinedPrompterType:\n    # Custom user instruction prompt\n    system_prompt: str | None\n    # Use {system} as key to be replaced\n    system_format: str | None\n    field_system: str | None\n    field_instruction: str | None\n    field_input: str | None\n    field_output: str | None\n\n    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n    # be replaced. 'format' can include {input}\n    format: str | None\n    # 'no_input_format' cannot include {input}\n    no_input_format: str | None\n  input_transform: str | None\n  # split dataset into N pieces (use with shards_idx)\n  shards: int | None\n  # the index of sharded dataset to use\n  shards_idx: int | None\n  # process dataset in N sequential chunks for memory efficiency (exclusive with\n  # `shards`)\n  preprocess_shards: int | None\n  conversation: str | None\n\n  # The name of the chat template to use for training, following values are supported:\n  # tokenizer_default: Uses the chat template that is available in the\n  # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n  # will raise an error. This is the default.\n  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n  # template. The custom jinja template should be provided in the chat_template_jinja\n  # field.\n  chat_template: ChatTemplate | str | None\n  # Custom jinja chat template or path to jinja file. 
Used only if `chat_template:\n  # jinja` or empty.\n  chat_template_jinja: str | None\n  # path to source data files\n  data_files: str | list[str] | None\n  input_format: str | None\n  # name of dataset configuration to load\n  name: str | None\n  # defines the datatype when path is a file\n  ds_type: str | None\n  # For `completion` datasets only, uses the provided field instead of `text` column\n  field: str | None\n  field_human: str | None\n  field_model: str | None\n  # Key containing the messages (default: \"messages\")\n  field_messages: str | None\n  # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n  # schema](https://json-schema.org/learn/getting-started-step-by-step).\n  field_tools: str | None\n  # Key containing the reasoning trace (default: \"reasoning_content\").\n  field_thinking: str | None\n  # The key the chat template expects that indicates the reasoning trace.\n  template_thinking_key: str | None\n\n  message_field_role: str | None\n\n  message_field_content: str | None\n  # Mapping of properties from the input dataset to the chat template. (default:\n  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n  # in the template but not in this mapping, the system will attempt to load it directly\n  # from the message using the property name as the key. Example: In the mapping below,\n  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n  # used as 'content' in the chat template.\n  message_property_mappings: dict[str, str] | None\n  # The key in the message turn that indicates via boolean whether tokens of a turn\n  # should be considered for training. Useful to selectively train on certain turns\n  # besides the `roles_to_train`.\n  message_field_training: str | None\n  # The key in the message turn that contains the training details. Useful to\n  # selectively train on certain tokens in a turn. 
The value of the key is a List[Dict]\n  # containing `begin_offset` (start character index in content), `end_offset` (end\n  # character index in content), and `train` (boolean whether to train).\n  message_field_training_detail: str | None\n  # (for Qwen3 template only) Whether to split the assistant content based on a\n  # reasoning trace inside delimited tags\n  split_thinking: bool | None\n  logprobs_field: str | None\n  temperature: float | None\n  # Roles to train on. The tokens from these roles will be considered for the loss.\n  roles_to_train: list[str] | None\n  # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n  # turn, last: train on the last EOS token in the conversation\n  train_on_eos: Literal['all', 'turn', 'last'] | None\n  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n  # source roles will be mapped to the target role. The default is: user: [\"human\",\n  # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n  roles: dict[str, list[str]] | None\n  # Whether to drop the system turn from the dataset. Only works with chat_template.\n  # This does not drop the default system message from chat_template if it exists. If\n  # you wish to, we recommend using a custom jinja template with the default system\n  # message removed or adding a system turn with empty content.\n  drop_system_message: bool | None\n  # Trust remote code for untrusted source\n  trust_remote_code: bool | None = False\n  # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n  # This can be a commit hash, tag, or branch name. If not specified, the latest version\n  # will be used. This parameter is ignored for local datasets.\n  revision: str | None\n\n# The maximum number of processes to use while preprocessing your input dataset. 
This\n# defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of\n# vCPUs via RUNPOD_CPU_COUNT.\ndataset_processes: int | None\n# The maximum number of processes to use while preprocessing your input dataset. This\n# defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of\n# vCPUs via RUNPOD_CPU_COUNT.\ndataset_num_proc: int | None\n\n# Deduplicates datasets and test_datasets with identical entries\ndataset_exact_deduplication: bool | None\n# Keep dataset in memory while preprocessing. Only needed if cached dataset is taking\n# too much storage\ndataset_keep_in_memory: bool | None\ndataloader_pin_memory: bool | None\ndataloader_num_workers: int | None\ndataloader_prefetch_factor: int | None\ndataloader_drop_last: bool | None\n\naccelerator_config: dict[str, Any] | None\n\nremove_unused_columns: bool | None\n\n# Push prepared dataset to hub - repo_org/repo_name\npush_dataset_to_hub: str | None\n# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private\n# datasets. Required to be true when used in combination with `push_dataset_to_hub`\nhf_use_auth_token: bool | None\n\ndevice: Any | None\n# Passed through to transformers when loading the model when launched without\n# accelerate. Use `sequential` when training w/ model parallelism to limit memory\ndevice_map: Any | None\nworld_size: int | None\n# Don't mess with this, it's here for accelerate and torchrun\nlocal_rank: int | None\nddp: bool | None\n\n# Seed for reproducibility\nseed: int | None\n# Advanced DDP Arguments - timeout\nddp_timeout: int | None\n# Advanced DDP Arguments - bucket cap in MB\nddp_bucket_cap_mb: int | None\n# Advanced DDP Arguments - broadcast buffers\nddp_broadcast_buffers: bool | None\nddp_find_unused_parameters: bool | None\n\n# Whether to run causal language model evaluation for metrics in\n# `eval_causal_lm_metrics`\ndo_causal_lm_eval: bool | None\n# HF evaluate metrics used during evaluation. 
Default is ['sacrebleu', 'comet', 'ter',\n# 'chrf', 'perplexity']\neval_causal_lm_metrics: list[str] | None\ndo_bench_eval: bool | None\nbench_dataset: str | None\nbench_split: str | None\nmetric_for_best_model: str | None\ngreater_is_better: bool | None\n\n# High loss value, indicating the learning has broken down (a good estimate is ~2 times\n# the loss at the start of training)\nloss_watchdog_threshold: float | None\n# Number of high-loss steps in a row before the trainer aborts (default: 3)\nloss_watchdog_patience: int | None\n\n# Run garbage collection every `gc_steps` steps. -1 will run on epoch end and before\n# evaluations. Default is 0 (disabled).\ngc_steps: int | None\n\n# Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection.\n# require &gt;=ampere\nbf16: Literal['auto'] | bool | None = auto\n# Use CUDA fp16\nfp16: bool | None\n# Enable FP8 mixed precision training using TorchAO. Best used in combination with\n# torch.compile.\nfp8: bool | None\n# Enable FSDP float8 all-gather optimization for FP8 training. Can improve training\n# speed by 10-15% when FSDP is enabled.\nfp8_enable_fsdp_float8_all_gather: bool | None\n# No AMP (automatic mixed precision) - require &gt;=ampere\nbfloat16: bool | None\n# No AMP (automatic mixed precision)\nfloat16: bool | None\n# bool to use CUDA tf32 or 'auto' for automatic detection - require &gt;=ampere\ntf32: Literal['auto'] | bool | None = auto\nfloat32: bool | None\n\n# Whether to use gradient checkpointing. Available options are: true, false, 'offload',\n# 'offload_disk'.\n# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\ngradient_checkpointing: Literal['offload', 'offload_disk'] | bool | None = False\n# Additional kwargs to pass to the trainer for gradient checkpointing\ngradient_checkpointing_kwargs: dict[str, Any] | None\n# Whether to offload activations. 
Available options are: true, false, 'legacy', 'disk'.\nactivation_offloading: Literal['legacy', 'disk'] | bool | None = False\n# Offload model layer parameters to CPU during forward, prefetch back during backward.\nlayer_offloading: bool | None = False\n\n# Freeze multimodal encoder parameters (vision, audio, etc.) for text-only training of\n# multimodal models. When True, parameters belonging to vision towers, audio towers,\n# multimodal projectors, and similar non-language modules are frozen\n# (requires_grad=False). This allows DDP training without\n# ddp_find_unused_parameters=True.\nfreeze_mm_modules: bool | None\n\n# List of regex patterns for parameter names to keep unfrozen. All other parameters will\n# be frozen via requires_grad=False. Note: range-based patterns (e.g.\n# embed_tokens.weight$[:32000]) use gradient zeroing rather than a true freeze, so\n# weight decay will still apply to the frozen portion and optimizer states are allocated\n# for the full parameter.\nunfrozen_parameters: list[str] | None\n\n# The maximum length of an input to train with, this should typically be less than 2048\n# as most models have a token/context limit of 2048\nsequence_len: int = 512\n# What to do when a tokenized row exceeds sequence_len. 'drop' removes the row;\n# 'truncate' slices tensors to sequence_len; 'raise' raises a ValueError. Defaults to\n# 'drop' for backward compatibility.\nexcess_length_strategy: Literal['drop', 'truncate', 'raise'] | None\n# The maximum length of an input for evaluation. If not specified, defaults to\n# sequence_len\neval_sequence_len: int | None\nmin_sample_len: int | None\n# maximum prompt length for RL training\nmax_prompt_len: int | None\n# Use efficient multi-packing with block diagonal attention and per sequence\n# position_ids. Recommend set to 'true'\nsample_packing: bool | None\n# The number of samples packed at a time. 
Increasing the following values helps with\n# packing, but usually only slightly (&lt;1%).\nsample_packing_group_size: int | None = 100000\n# The number of samples which can be packed into one sequence. Increase if using a large\n# sequence_len with many short samples.\nsample_packing_bin_size: int | None = 200\n# Whether to pack samples sequentially\nsample_packing_sequentially: bool | None\n# The multiprocessing start method to use for packing. Should be 'fork', 'spawn' or\n# 'forkserver'\nsample_packing_mp_start_method: str | None\n# Set to 'false' if getting errors during eval with sample_packing on\neval_sample_packing: bool | None\n# Pad inputs so each step uses constant sized buffers. This will reduce memory\n# fragmentation and may prevent OOMs, by re-using memory more efficiently. Defaults to\n# True if `sample_packing` enabled\npad_to_sequence_len: bool | None\n# Whether to use sequential sampling for curriculum learning\ncurriculum_sampling: bool | None\nmultipack_real_batches: bool | None\n\n# Use batch flattening for speedups when not using sample_packing\nbatch_flattening: Literal['auto'] | bool | None\n\nuse_pose: bool | None\npose_split_on_token_ids: list[int] | None\npose_max_context_len: int | None\npose_num_chunks: int | None\n\npretrain_multipack_buffer_size: int | None\n# whether to prevent cross attention for packed sequences during pretraining\npretrain_multipack_attn: bool | None = True\n# whether to concatenate samples during pretraining\npretraining_sample_concatenation: bool | None\n\n# Use streaming mode for loading datasets\nstreaming: bool | None\n# Buffer size for multipack streaming datasets\nstreaming_multipack_buffer_size: int | None = 10000\n\n# Whether to use xformers attention patch https://github.com/facebookresearch/xformers\nxformers_attention: bool | None\n# Whether to use scaled-dot-product attention\n# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html\nsdp_attention: bool | 
None\n# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf\ns2_attention: bool | None\nflex_attention: bool | None\nflex_attn_compile_kwargs: dict[str, Any] | None\n# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention\nflash_attention: bool | None\n# Whether to use flash-attention cross entropy implementation - advanced use only\nflash_attn_cross_entropy: bool | None\n# Whether to use flash-attention rms norm implementation - advanced use only\nflash_attn_rms_norm: bool | None\n# Whether to fuse part of the MLP into a single operation\nflash_attn_fuse_mlp: bool | None\n# Whether to use bettertransformers\nflash_optimum: bool | None\n# Whether to use SageAttention https://github.com/thu-ml/SageAttention\nsage_attention: bool | None\n\neager_attention: bool | None\n\n# Specify a custom attention implementation, used mostly for kernels.\nattn_implementation: str | None\n\n# Use hybrid attention for Gemma 4: flash_attention_2 for sliding window layers and sdpa\n# for global (full_attention) layers. Global layers have head_dim=512 which exceeds\n# flash attention's supported size.\ngemma4_hybrid_attn_impl: bool | None\n\n# Which experts implementation to use for MoE models,\nexperts_implementation: str | None\n\n# Quantize MoE expert weights on load to reduce VRAM. Requires adapter (lora/qlora) with\n# load_in_4bit or load_in_8bit. Requires CUDA (not compatible with ROCm or other\n# backends). Note: total parameter count may be reported incorrectly when enabled\n# (trainable param count is correct).\nquantize_moe_experts: bool = False\n\n# Whether to use Scaled Softmax (SSMax) attention. Ref: https://arxiv.org/abs/2501.19399\nscaling_softmax: bool | None\n# Scaling factor for SSMax attention. Default is 0.43\nscaling_softmax_factor: float | None\n# Bias for SSMax attention. Default is 0.0. 
Note: The paper recommends bias=0 for better\n# length generalization.\nscaling_softmax_bias: float | None\n\nunsloth_cross_entropy_loss: bool | None\nunsloth_lora_mlp: bool | None\nunsloth_lora_qkv: bool | None\nunsloth_lora_o: bool | None\nunsloth_rms_norm: bool | None\nunsloth_rope: bool | None\n\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_mlp_kernel: bool | None\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_qkv_kernel: bool | None\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_o_kernel: bool | None\n# Apply custom LoRA autograd function for embedding layers. See:\n# https://docs.axolotl.ai/docs/lora_optims.html\nlora_embedding_kernel: bool | None\n\n# Whether to use chunked cross entropy loss for memory efficiency\nchunked_cross_entropy: bool | None\n# Number of chunks to use for chunked cross entropy loss\nchunked_cross_entropy_num_chunks: int | None\n# Enable Entropy-Aware Focal Training loss (EAFT)\nuse_eaft: bool | None\n# Exponent for entropy weighting in EAFT (default: 1.0)\neaft_alpha: float | None = 1.0\n# Number of top logits for entropy approximation (default: 20)\neaft_k: int | None = 20\n\n# Whether to use ALST tiled mlp for memory efficient long context\ntiled_mlp: bool | None\n\n# Number of shards to use for ALST tiled mlp. If unset, it will be set based on\n# seqlen/hidden_size\ntiled_mlp_num_shards: int | None\n\n# Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on\n# llama.\ntiled_mlp_use_original_mlp: bool | None = True\n\nllama4_linearized_experts: bool | None\n\n# Deepspeed config path. 
e.g., deepspeed_configs/zero3.json\ndeepspeed: str | dict[str, Any] | None\n# Whether to use deepcompile for faster training with deepspeed\ndeepcompile: bool | None\n# FSDP configuration\nfsdp: list[str] | None\n\n# FSDP configuration options\nfsdp_config: FSDPConfig | None\n  # For FSDPConfig:\n  # FSDP version\n  fsdp_version: int | None\n  # Enable activation checkpointing to reduce memory usage during forward passes\n  activation_checkpointing: bool | None\n  # Offload parameters to CPU to reduce GPU memory usage\n  offload_params: bool | None\n  # Synchronize module states across all processes\n  sync_module_states: bool | None\n  # Enable CPU RAM efficient loading to reduce memory usage during model loading\n  cpu_ram_efficient_loading: bool | None\n  # Disabling this enables swap memory usage for resource-constrained setups when\n  # offload_params is enabled.\n  cpu_offload_pin_memory: bool | None\n  # Use original parameters instead of flattened parameters\n  use_orig_params: bool | None\n\n  # Type of state dict to use for saving/loading checkpoints\n  state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n  # Final state dict type to use after training completion\n  final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n\n  # Policy for automatically wrapping modules with FSDP\n  auto_wrap_policy: Literal['TRANSFORMER_BASED_WRAP', 'SIZE_BASED_WRAP'] | None\n  # Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')\n  transformer_layer_cls_to_wrap: str | None\n\n  # Reshard parameters after forward pass to save memory\n  reshard_after_forward: bool | None\n  # Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')\n  mixed_precision_policy: str | None\n\n# FSDP version\nfsdp_version: int | None\nfsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n\n# How much of the dataset to set aside as evaluation. 
1 = 100%, 0.50 = 50%, etc. 0 for\n# no eval.\nval_set_size: float | None = 0.0\n\n# Number of devices to shard across. If not set, will use all available devices.\ndp_shard_size: int | None\n# Number of devices to replicate across.\ndp_replicate_size: int | None\n# Deprecated: use `context_parallel_size` instead\nsequence_parallel_degree: int | None\n# Set to a divisor of the number of GPUs available to split sequences into chunks of\n# equal size. Use in long context training to prevent OOM when sequences cannot fit into\n# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each\n# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized\n# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more\n# details.\ncontext_parallel_size: int | None\n# Optional; strides across the key dimension. Larger values use more memory but should\n# make training faster. Must evenly divide the number of KV heads in your model.\nheads_k_stride: int | None\n# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to\n# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing\n# case.\nring_attn_func: RingAttnFunc | None\n# Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP.\ntensor_parallel_size: int | None\n\n# Add or change special tokens. If you add tokens here, you don't need to add them to\n# the `tokens` list.\nspecial_tokens: SpecialTokensConfig | None\n  # For SpecialTokensConfig:\n  bos_token: str | None\n  eos_token: str | None\n  pad_token: str | None\n  unk_token: str | None\n  additional_special_tokens: list[str] | None\n\n# Add extra tokens to the tokenizer\ntokens: list[str] | None\n# Mapping token_id to new_token_string to override reserved added_tokens in the\n# tokenizer. Only works for tokens that are not part of the base vocab (aka are\n# added_tokens). 
Can be checked if they exist in tokenizer.json added_tokens.\nadded_tokens_overrides: dict[int, str] | None\n\n# Whether to use torch.compile and which backend to use. setting to `auto` will enable\n# torch compile when torch&gt;=2.6.0\ntorch_compile: Literal['auto'] | bool | None\n# Backend to use for torch.compile\ntorch_compile_backend: str | None\ntorch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None\n\n# Maximum number of iterations to train for. It precedes num_epochs which means that if\n# both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps =&gt;\n# `num_epochs: 2` and `max_steps: 100` will train for 100 steps\nmax_steps: int | None\n# Number of warmup steps. Cannot use with warmup_ratio\nwarmup_steps: int | None\n# Warmup ratio. Cannot use with warmup_steps\nwarmup_ratio: float | None\n# Leave empty to eval at each epoch, integer for every N steps. float for fraction of\n# total steps\neval_steps: int | float | None\n# Number of times per epoch to run evals, mutually exclusive with eval_steps\nevals_per_epoch: int | None\n# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer\n# from `eval_steps`\neval_strategy: str | None\n\n# Leave empty to save at each epoch, integer for every N steps. float for fraction of\n# total steps\nsave_steps: int | float | None\n# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps\nsaves_per_epoch: int | None\n# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better\n# result is achieved, leave empty to infer from `save_steps`\nsave_strategy: str | None\n# Checkpoints saved at a time\nsave_total_limit: int | None\n# Whether to checkpoint a model after the first step of training. Defaults to False.\nsave_first_step: bool | None\n\n# Logging frequency\nlogging_steps: int | None\n# Stop training after this many evaluation losses have increased in a row. 
See\n# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback\nearly_stopping_patience: int | None\nload_best_model_at_end: bool | None = False\n# Save only the model weights, skipping the optimizer. Using this means you can't resume\n# from checkpoints.\nsave_only_model: bool | None = False\n# Use tensorboard for logging\nuse_tensorboard: bool | None\n# Enable the pytorch profiler to capture the first N steps of training to the\n# output_dir. See https://pytorch.org/blog/understanding-gpu-memory-1/ for more\n# information. Snapshots can be visualized at https://pytorch.org/memory_viz\nprofiler_steps: int | None\n# Which step to start the profiler at. Useful for only capturing a few steps mid-run.\nprofiler_steps_start: int | None = 0\n# bool of whether to report tokens per second at the end of training. This is not\n# supported with pre-training datasets.\ninclude_tokens_per_second: bool | None\n# bool of whether to report tokens per second per-gpu during training by measuring\n# throughput of non-padding tokens.\ninclude_tkps: bool | None = True\n# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to\n# add noise to embeddings. Currently only supported on Llama and Mistral\nneftune_noise_alpha: float | None\n\n# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to\n# `beta` in `ORPOConfig` due to trl mapping.\norpo_alpha: float | None\n# Target reward margin for the SimPO loss\nsimpo_gamma: float | None\n# Weight of the BC regularizer\ncpo_alpha: float | None\n\n# Factor for desirable loss term in KTO loss\nkto_desirable_weight: float | None\n# Factor for undesirable loss term in KTO loss\nkto_undesirable_weight: float | None\n# The beta parameter for the RL training\nrl_beta: float | None\n\n# Defines the max memory usage per gpu on the system. 
Passed through to transformers\n# when loading the model.\nmax_memory: dict[int | Literal['cpu', 'disk'], int | str] | None\n# Limit the memory for all available GPUs to this amount (if an integer, expressed in\n# gigabytes); default: unset\ngpu_memory_limit: int | str | None\n# Whether to use low_cpu_mem_usage\nlow_cpu_mem_usage: bool | None\n\n# The name of the chat template to use for training, following values are supported:\n# tokenizer_default: Uses the chat template that is available in the\n# tokenizer_config.json. If the chat template is not available in the tokenizer, it will\n# raise an error. This is the default value.\n# alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n# are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n# tokenizer_default_fallback_*: where * is the name of the chat template to fallback to.\n# E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not\n# available in the tokenizer. jinja: Uses a custom jinja template for the chat template.\n# The custom jinja template should be provided in the chat_template_jinja field. The\n# selected chat template will be saved to the tokenizer_config.json for easier\n# inferencing\nchat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None\n# Custom jinja template or path to jinja file for chat template. This will be only used\n# if chat_template is set to `jinja` or `null` (in which case chat_template is\n# automatically set to `jinja`). Default is null.\nchat_template_jinja: str | None\n# Additional kwargs to pass to the chat template. This is useful for customizing the\n# chat template. For example, you can pass `thinking=False` to add a generation prompt\n# to the chat template.\nchat_template_kwargs: dict[str, Any] | None\n# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the\n# boundaries between conversation turns. 
For example: ['/INST', '&lt;/s&gt;',\n# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is\n# useful for templates that use multiple delimiter tokens.\neot_tokens: list[str] | None\n# Changes the default system message. Currently only supports chatml.\ndefault_system_message: str | None\n\n# Token index or indices to adjust embedding weights to the mean of the other tokens.\n# This is useful when the model has untrained embeddings.\nfix_untrained_tokens: int | list[int] | None\n\nis_preprocess: bool | None\npreprocess_iterable: bool | None\n\n# Total number of tokens - internal use\ntotal_num_tokens: int | None\ntotal_supervised_tokens: int | None\n# You can set these packing optimizations AFTER starting a training at least once. The\n# trainer will provide recommended values for these settings.\nsample_packing_eff_est: float | None\naxolotl_config_path: str | None\n\n# Internal use only - Used to identify which model family the model is based on\nis_falcon_derived_model: bool | None\n# Internal use only - Used to identify which model family the model is based on\nis_llama_derived_model: bool | None\n# Internal use only - Used to identify which model family the model is based on. Please\n# note that if you set this to true, `padding_side` will be set to 'left' by default\nis_mistral_derived_model: bool | None\n# Internal use only - Used to identify which model family the model is based on\nis_qwen_derived_model: bool | None\n\n# Add plugins to extend the pipeline. 
See `src/axolotl/integrations` for the available\n# plugins or doc below for more details.\n# https://docs.axolotl.ai/docs/custom_integrations.html\nplugins: list[str] | None\n# Enable sample generation during training for monitoring\ngenerate_samples: bool | None = False\n# Number of samples to generate at each interval\nnum_generation_samples: int | None = 3\n# Maximum new tokens to generate per sample\ngeneration_max_new_tokens: int | None = 50\n# Temperature for sample generation (0.0 = greedy)\ngeneration_temperature: float | None = 0.7\n# Nucleus sampling parameter for generation\ngeneration_top_p: float | None\n# Top-k sampling parameter for generation\ngeneration_top_k: int | None\n# Ratio of input to use as prompt (0.0-1.0)\ngeneration_prompt_ratio: float | None = 0.5\n# Whether to use sampling (vs greedy decoding)\ngeneration_do_sample: bool | None = True\n\n# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This\n# can also be a relative path to a model on disk\nbase_model: str (required)\n# If the base_model repo on hf hub doesn't include configuration .json files, You can\n# set that here, or leave this empty to default to base_model\nbase_model_config: str | None\n# transformers config class (e.g., 'LlamaConfig', 'MistralConfig'). Defaults to\n# AutoConfig.\ncls_model_config: str | None\n# Optional tokenizer configuration path in case you want to use a different tokenizer\n# than the one defined in the base model\ntokenizer_config: str | None\n# use_fast option for tokenizer loading from_pretrained, default to True\ntokenizer_use_fast: bool | None\n# Whether to use the legacy tokenizer setting, defaults to True\ntokenizer_legacy: bool | None\n# Whether to use mistral-common tokenizer. 
If set to True, it will use the mistral-\n# common tokenizer.\ntokenizer_use_mistral_common: bool | None\n# Corresponding tokenizer for the model AutoTokenizer is a good choice\ntokenizer_type: str | None\n# transformers processor class\nprocessor_type: str | None\n# Whether to save jinja files for tokenizer, transformers default is True\ntokenizer_save_jinja_files: bool | None = True\n# Trust remote code for untrusted source\ntrust_remote_code: bool | None\n\n# Don't move the model to the device before sharding. Set to `false` to revert to legacy\n# behavior.\nexperimental_skip_move_to_device: bool | None = True\n\n# Use custom kernels, e.g. MegaBlocks.\nuse_kernels: bool | None\n\n# Model loading quantization config\nmodel_quantization_config: Literal['Mxfp4Config', 'FineGrainedFP8Config'] | None\n# kwargs for model quantization config\nmodel_quantization_config_kwargs: dict[str, Any] | None\n\n# Where to save the full-finetuned model to\noutput_dir: str = ./model-out\n# push checkpoints to hub\nhub_model_id: str | None\n# how to push checkpoints to hub\nhub_strategy: str | None\n# branch/revision to push to on hub (default: main)\nhub_revision: str | None\n# Whether to save the model using safetensors format. Defaults to True.\nsave_safetensors: bool | None = True\n\n# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer\nload_in_8bit: bool | None = False\n# Use bitsandbytes 4 bit\nload_in_4bit: bool | None = False\n\n# If you want to use 'lora', 'qlora', or 'llama-adapter', or leave blank to train all\n# parameters in original model\nadapter: Literal['lora', 'qlora', 'llama-adapter'] | None\n# If you already have a lora model trained that you want to load, put that here. This\n# means after training, if you want to test the model, you should set this to the value\n# of `output_dir`. 
Note that if you merge an adapter to the base model, a new\n# subdirectory `merged` will be created under the `output_dir`.\nlora_model_dir: str | None\nlora_r: int | None\nlora_alpha: int | None\nlora_fan_in_fan_out: bool | None\nlora_target_modules: str | list[str] | None\nlora_target_parameters: str | list[str] | None\n# If true, will target all linear modules\nlora_target_linear: bool | None\n# If you added new tokens to the tokenizer, you may need to save some LoRA modules\n# because they need to know the new tokens. For LLaMA and Mistral, you need to save\n# `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts\n# tokens to embeddings, and `lm_head` converts embeddings to token probabilities.\nlora_modules_to_save: list[str] | None\nlora_dropout: float | None = 0.0\n# The layer indices to transform, otherwise, apply to all layers\npeft_layers_to_transform: list[int] | None\npeft_layers_pattern: list[str] | None\n\npeft: PeftConfig | None\n  # For PeftConfig:\n  # Configuration options for loftq initialization for LoRA\n  loftq_config: LoftQConfig | None\n    # For LoftQConfig:\n    # typically 4 bits\n    loftq_bits: int = 4\n\n# Whether to use DoRA.\npeft_use_dora: bool | None\n# Whether to use RSLoRA.\npeft_use_rslora: bool | None\n# List of layer indices to replicate.\npeft_layer_replication: list[tuple[int, int]] | None\n# How to initialize LoRA weights. Default to True which is MS original implementation.\npeft_init_lora_weights: bool | str | None\n# A list of token indices to fine-tune on the `embed_tokens` layer. Otherwise, a dict\n# mapping an embedding layer name to its trainable token indices. See\n# https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-\n# tokens-alongside-lora\npeft_trainable_token_indices: list[int] | dict[str, list[int]] | None\n# Whether to tie adapter weights for tied model weights. 
See\n# https://github.com/huggingface/peft/issues/2864\npeft_ensure_weight_tying: bool | None\n# Whether to upcast the LoRA adapter to fp32. This is enabled by default in PEFT.\npeft_autocast_adapter_dtype: bool | None\n\n# load qlora model in sharded format for FSDP using answer.ai technique.\nqlora_sharded_model_loading: bool | None = False\n# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it\n# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge\nlora_on_cpu: bool | None\n# Whether you are training a 4-bit GPTQ quantized model\ngptq: bool | None\n# optional overrides to the bnb 4bit quantization configuration\nbnb_config_kwargs: dict[str, Any] | None\n\n# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.\nloraplus_lr_ratio: float | None\n# loraplus learning rate for lora embedding layers. Default value is 1e-6.\nloraplus_lr_embedding: float | None = 1e-06\n\nmerge_lora: bool | None\n# Method to use for LoRA merging. 'memory_efficient' (default) processes shards\n# individually to reduce memory usage, 'legacy' loads the full model into memory.\nmerge_method: Literal['legacy', 'memory_efficient'] | None = memory_efficient\n\n# Whether to use ReLoRA. 
Use with jagged_restart_*steps options.\nrelora: bool | None\n# threshold for optimizer magnitude when pruning\nrelora_prune_ratio: float | None\n# True to perform lora weight merges on cpu during restarts, for modest gpu memory\n# savings\nrelora_cpu_offload: bool | None\n\n# how often to reset for jagged restarts\njagged_restart_steps: int | None\n# how many warmup steps to take after reset for jagged restarts\njagged_restart_warmup_steps: int | None\n# how many anneal steps to take before reset for jagged restarts\njagged_restart_anneal_steps: int | None\n\n# If greater than 1, backpropagation will be skipped and the gradients will be\n# accumulated for the given number of steps.\ngradient_accumulation_steps: int | None = 1\n# The number of samples to include in each batch. This is the number of samples sent to\n# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps\nmicro_batch_size: int | None = 1\n# Total batch size, we do not recommended setting this manually\nbatch_size: int | None\n# per gpu micro batch size for evals, defaults to value of micro_batch_size\neval_batch_size: int | None\n\n# whether to find batch size that fits in memory. Passed to underlying transformers\n# Trainer\nauto_find_batch_size: bool | None\n\n# Whether to mask out or include the human's prompt from the training labels\ntrain_on_inputs: bool | None = False\n# Group similarly sized data to minimize padding. May be slower to start, as it must\n# download and sort the entire dataset. 
Note that training loss may have an oscillating\n# pattern with this enabled.\ngroup_by_length: bool | None\n\nlearning_rate: str | float (required)\nembedding_lr: float | None\nembedding_lr_scale: float | None\n# Specify weight decay\nweight_decay: float | None = 0.0\n# Specify optimizer\noptimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED\n# Dictionary of arguments to pass to the optimizer\noptim_args: str | dict[str, Any] | None\n# The target modules to optimize, i.e. the module names that you would like to train,\n# right now this is used only for GaLore algorithm\noptim_target_modules: list[str] | Literal['all_linear'] | None\n# Path to torch distx for optim 'adamw_anyprecision'\ntorchdistx_path: str | None\nlr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE\n# Specify a scheduler and kwargs to use with the optimizer\nlr_scheduler_kwargs: dict[str, Any] | None\nlr_quadratic_warmup: bool | None\n# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of\n# peak lr\ncosine_min_lr_ratio: float | None\n# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means\n# start cosine_min_lr at 80% of training step\ncosine_constant_lr_ratio: float | None\n# Learning rate div factor\nlr_div_factor: float | None\n\nlr_groups: list[LrGroup] | None\n  # For LrGroup:\n  name: str (required)\n  modules: list[str] (required)\n  lr: float (required)\n\n# adamw hyperparams\nadam_epsilon: float | None\n# only used for CAME Optimizer\nadam_epsilon2: float | None\n# adamw hyperparams\nadam_beta1: float | None\n# adamw hyperparams\nadam_beta2: float | None\n# only used for CAME Optimizer\nadam_beta3: float | None\n\n# Dion Optimizer learning rate\ndion_lr: float | None\n# Dion Optimizer momentum\ndion_momentum: float | None\n# Dion Optimizer: r/d fraction for low-rank approximation. 
Used to compute the low-rank\n# dimension.\ndion_rank_fraction: float | None = 1.0\n# Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may\n# be useful to ensure even sharding.\ndion_rank_multiple_of: int | None = 1\n\n# Gradient clipping max norm\nmax_grad_norm: float | None\nnum_epochs: float = 1.0\n\nuse_wandb: bool | None\n# Set the name of your wandb run\nwandb_name: str | None\n# Set the ID of your wandb run\nwandb_run_id: str | None\n# \"offline\" to save run metadata locally and not sync to the server, \"disabled\" to turn\n# off wandb\nwandb_mode: str | None\n# Your wandb project name\nwandb_project: str | None\n# A wandb Team name if using a Team\nwandb_entity: str | None\nwandb_watch: str | None\n# \"checkpoint\" to log model to wandb Artifacts every `save_steps` or \"end\" to log only\n# at the end of training\nwandb_log_model: str | None\n\nuse_mlflow: bool | None\n# URI to mlflow\nmlflow_tracking_uri: str | None\n# Your experiment name\nmlflow_experiment_name: str | None\n# Your run name\nmlflow_run_name: str | None\n# set to true to copy each saved checkpoint on each save to mlflow artifact registry\nhf_mlflow_log_artifacts: bool | None\n\n# Enable or disable Comet integration.\nuse_comet: bool | None\n# API key for Comet. Recommended to set via `comet login`.\ncomet_api_key: str | None\n# Workspace name in Comet. Defaults to the user's default workspace.\ncomet_workspace: str | None\n# Project name in Comet. Defaults to Uncategorized.\ncomet_project_name: str | None\n# Identifier for the experiment. Used to append data to an existing experiment or\n# control the key of new experiments. Default to a random key.\ncomet_experiment_key: str | None\n# Create a new experiment (\"create\") or log to an existing one (\"get\"). Default\n# (\"get_or_create\") auto-selects based on configuration.\ncomet_mode: str | None\n# Set to True to log data to Comet server, or False for offline storage. 
Default is\n# True.\ncomet_online: bool | None\n# Dictionary for additional configuration settings, see the doc for more details.\ncomet_experiment_config: dict[str, Any] | None\n\nuse_trackio: bool | None\n# Your trackio project name\ntrackio_project_name: str | None\n# Set the name of your trackio run\ntrackio_run_name: str | None\n# Hugging Face Space ID to sync dashboard to (optional, runs locally if not provided)\ntrackio_space_id: str | None\n\n# Enable OpenTelemetry metrics collection and Prometheus export\nuse_otel_metrics: bool | None = False\n# Host to bind the OpenTelemetry metrics server to\notel_metrics_host: str | None = localhost\n# Port for the Prometheus metrics HTTP server\notel_metrics_port: int | None = 8000\n\n# the number of activate layers in LISA\nlisa_n_layers: int | None\n# how often to switch layers in LISA\nlisa_step_interval: int | None\n# path under the model to access the layers\nlisa_layers_attribute: str | None = model.layers\n\ngradio_title: str | None\ngradio_share: bool | None\ngradio_server_name: str | None\ngradio_server_port: int | None\ngradio_max_new_tokens: int | None\ngradio_temperature: float | None\n\nuse_ray: bool = False\nray_run_name: str | None\nray_num_workers: int = 1\nresources_per_worker: dict\n\n# The size of the image to resize to. It can be an integer (resized into padded-square\n# image) or a tuple (width, height).If not provided, we will attempt to load from\n# preprocessor.size, otherwise, images won't be resized.\nimage_size: int | tuple[int, int] | None\n# The resampling algorithm to use for image resizing. Default is bilinear. 
Please refer\n# to PIL.Image.Resampling for more details.\nimage_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None\n\n# optional overrides to the base model configuration\noverrides_of_model_config: dict[str, Any] | None\n# optional overrides the base model loading from_pretrained\noverrides_of_model_kwargs: dict[str, Any] | None\n# If you want to specify the type of model to load, AutoModelForCausalLM is a good\n# choice too\ntype_of_model: str | None\n# You can specify to choose a specific model revision from huggingface hub\nrevision_of_model: str | None\n\nmax_packed_sequence_len: int | None\nrope_scaling: Any | None\nnoisy_embedding_alpha: float | None\ndpo_beta: float | None\nevaluation_strategy: str | None\neval_table_size: int | None\neval_max_new_tokens: int | None\ndpo_use_logits_to_keep: bool | None\ndpo_generate_during_eval: bool | None\ndpo_norm_loss: bool | None\nrpo_alpha: float | None",
+    "text": "# Allow overwriting the yml config from the CLI\nstrict: bool | None = False\n# Resume from a specific checkpoint dir\nresume_from_checkpoint: str | None\n# If resume_from_checkpoint isn't set and you simply want it to start where it left off.\n# Be careful with this being turned on between different models.\nauto_resume_from_checkpoints: bool | None\n# Resize the model embeddings when new tokens are added to multiples of 32. This is\n# reported to improve training speed on some models\nresize_token_embeddings_to_32x: bool | None\nmean_resizing_embeddings: bool | None = False\n\n# Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.\nshrink_embeddings: bool | None\n# Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs\nembeddings_skip_upcast: bool | None\n# Reinitialize model weights randomly instead of loading pretrained weights\nreinit_weights: bool | None\n\n# module path of the custom trainer class to use for training\ntrainer_cls: str | None\n\n# Use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo', 'ebft'\nrl: RLType | None\n\ntrl: TRLConfig | None\n  # For TRLConfig:\n  # Beta parameter for the RL training. Same as `rl_beta`.\n  beta: float | None\n  # Maximum length of the completion for RL training.\n  max_completion_length: int | None\n\n  # Whether to use VLLM for RL training.\n  use_vllm: bool = False\n  # VLLM mode to use, one of 'server' or 'colocate'\n  vllm_mode: Literal['server', 'colocate'] | None\n  # Host of the vLLM server to connect to.\n  vllm_server_host: str | None = 0.0.0.0\n  # Port of the vLLM server to connect to.\n  vllm_server_port: int | None = 8000\n  # Total timeout (in seconds) to wait for the vLLM server to respond.\n  vllm_server_timeout: int | None\n  # Regex for vLLM guided decoding.\n  vllm_guided_decoding_regex: str | None\n\n  # List of reward functions to load. 
Paths must be importable from current dir.\n  reward_funcs: list[str] | None\n  # List of reward weights for the reward functions.\n  reward_weights: list[float] | None\n  # Batch size for generation. Controls how many unique prompts are generated per step.\n  # Should be num_generations * data_parallel_size for full DP utilization.\n  generation_batch_size: int | None\n  # Number of generations to sample.\n  num_generations: int | None\n  # Whether to log completions.\n  log_completions: bool | None = False\n  # Number of completions to print when log_completions is True.\n  num_completions_to_print: int | None\n  # Controls whether importance sampling ratios are computed at the `'token'` or\n  # `'sequence'` level. For GSPO, use `sequence`, default is None which corresponds to\n  # the original GRPO paper.\n  importance_sampling_level: Literal['sequence', 'token'] | None\n\n  # Whether to sync the reference model.\n  sync_ref_model: bool | None = False\n  # Mixup alpha for the reference model.\n  ref_model_mixup_alpha: float | None = 0.9\n  # Sync steps for the reference model.\n  ref_model_sync_steps: int | None = 64\n  # Whether to scale rewards by their standard deviation.\n  scale_rewards: bool = True\n\n  # Sampling temperature for the GRPO policy.\n  temperature: float | None\n  # Top-p sampling probability for the generation policy.\n  top_p: float | None\n  # Top-k sampling for the generation policy.\n  top_k: int | None\n  # Minimum probability for the generation policy.\n  min_p: float | None\n  # Penalty for tokens that appear in prompt and generated text.\n  repetition_penalty: float | None\n  # Additional generation parameters passed to vLLM SamplingParams. Useful for\n  # stop_token_ids, seed, frequency_penalty, etc.\n  generation_kwargs: dict[str, Any] | None\n  # Additional kwargs for the chat template. 
E.g., {enable_thinking: false} for Qwen3.5\n  # models.\n  chat_template_kwargs: dict[str, Any] | None\n  # Number of iterations per batch (μ) for GRPO.\n  num_iterations: int | None\n  # Epsilon value for clipping in the GRPO algorithm.\n  epsilon: float | None\n  # Upper-bound epsilon value for clipping in the GRPO algorithm.\n  epsilon_high: float | None\n  # Whether to use Liger loss for GRPO.\n  use_liger_loss: bool | None\n  # Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.\n  loss_type: str | None\n  # Whether to exclude truncated completions from loss calculation.\n  mask_truncated_completions: bool = False\n  # Enable sleep mode for vLLM to offload VRAM when idle\n  vllm_enable_sleep_mode: bool | None\n  # Path to custom rollout function. Must be importable from current dir.\n  rollout_func: str | None\n  # Multi-objective reward aggregation strategy. 'sum_then_normalize' (GRPO default):\n  # weights and sums rewards first, then normalizes. 'normalize_then_sum' (GDPO):\n  # normalizes each reward independently, then sums.\n  multi_objective_aggregation: Literal['sum_then_normalize', 'normalize_then_sum'] | None\n\n  # Use the GRPODataProducer protocol for online data generation.\n  use_data_producer: bool = False\n  # Generate rollouts in a background thread while training on the previous rollout.\n  async_prefetch: bool = False\n  # Number of rollouts to prefetch ahead of training.\n  prefetch_depth: int | None\n  # Sync model weights to vLLM every N optimizer steps (async mode only).\n  vllm_sync_interval: int | None\n  # Score prompt groups incrementally instead of the full batch at once.\n  streaming_partial_batch: bool | None\n  # Minimum prompt groups to score per streaming chunk.\n  streaming_min_groups: int | None\n  # Apply IS correction for distribution mismatch between vLLM and training model.\n  vllm_importance_sampling_correction: bool | None\n  # IS mode: token_truncate, token_mask, sequence_truncate, or sequence_mask.\n  
vllm_importance_sampling_mode: Literal['token_truncate', 'token_mask', 'sequence_truncate', 'sequence_mask'] | None\n  # Cap C for IS ratio clipping/masking.\n  vllm_importance_sampling_cap: float | None\n  # KL threshold for off-policy sequence masking (OPSM). None = disabled.\n  off_policy_mask_threshold: float | None\n  # Apply IS correction to KL divergence term.\n  use_bias_correction_kl: bool | None\n\n  # Number of persistent subprocess workers for parallel reward computation. Each worker\n  # has its own main thread so signal.alarm() (used by math_verify) works correctly.\n  # Work is sharded across workers by prompt groups. Only used with\n  # use_data_producer=True and non-nn.Module reward functions.\n  reward_num_workers: int = 1\n  # [Experimental, disabled by default] Size of the replay buffer for storing high-\n  # signal rollout groups. When &gt; 0, groups with reward variance are cached and used to\n  # replace zero-signal groups (where all rewards are identical). Set to 0 to disable.\n  # Only used with use_data_producer=True.\n  replay_buffer_size: int = 0\n  # When True (default), recompute old_per_token_logps for replayed groups using the\n  # current training model. This fixes the importance sampling mismatch that occurs when\n  # replaying stale data. Only relevant when replay_buffer_size &gt; 0.\n  replay_recompute_logps: bool = True\n  # Fraction of total training steps after which deferred re-rolling begins. Zero-signal\n  # prompts (where all rewards in a group are identical) are buffered and re-injected\n  # into later batches when the model is more likely to solve them. Set to 1.0 to\n  # disable. Only used with use_data_producer=True.\n  reroll_start_fraction: float = 1.0\n  # Maximum number of prompt groups to replace with re-roll candidates per batch. Higher\n  # values increase data utilization but reduce prompt diversity. 
Only used with\n  # use_data_producer=True.\n  reroll_max_groups: int = 1\n  # When True, skip gradient computation for micro-batches where all advantages are zero\n  # (no learning signal). This avoids the forward/backward pass entirely when no\n  # learning signal is present. The step is logged with skipped_zero_adv_batches=1 for\n  # monitoring.\n  skip_zero_advantage_batches: bool = True\n  # Sync LoRA adapter to vLLM via filesystem instead of merging + NCCL broadcast. Auto-\n  # selects vllm_serve_lora serve module. Syncs only LoRA adapter weights vs full merged\n  # model.\n  vllm_lora_sync: bool = False\n\nvllm: VllmConfig | None\n  # For VllmConfig:\n  # Device to use for VLLM\n  device: str | None = auto\n  # Tensor parallel size for VLLM\n  tensor_parallel_size: int | None\n  # Data parallel size for VLLM\n  data_parallel_size: int | None\n  # GPU memory utilization for VLLM\n  gpu_memory_utilization: float | None = 0.9\n  # Data type for VLLM\n  dtype: str | None = auto\n  # Maximum length of the model context for VLLM\n  max_model_len: int | None\n  # Enable prefix caching for VLLM\n  enable_prefix_caching: bool | None\n  # Host for the vLLM server to start on\n  host: str | None = 0.0.0.0\n  # Port of the vLLM server to start on\n  port: int | None = 8000\n\n  # Enable reasoning for VLLM\n  enable_reasoning: bool | None\n  # Reasoning parser for VLLM\n  reasoning_parser: str | None\n  # Disable CUDA graph capture in vLLM. Required for models with causal_conv1d (e.g.,\n  # Qwen3.5 hybrid linear attention).\n  enforce_eager: bool | None\n  # Python module for vLLM serve script. Set to 'axolotl.scripts.vllm_serve_lora' for\n  # native LoRA support, or leave None for default TRL serve.\n  serve_module: str | None\n  # vLLM worker extension class for weight synchronization. 
Defaults to\n  # 'trl.scripts.vllm_serve.WeightSyncWorkerExtension'.\n  worker_extension_cls: str | None\n\n# Configuration for Energy-Based Fine-Tuning (EBFT)\nebft: EBFTConfig | None\n  # For EBFTConfig:\n  # Fractional layer depths for feature extraction (e.g., [0.25, 0.5, 0.75])\n  feature_layers: list[float] = [0.25, 0.5, 0.75]\n  # Embedding method: 'last_token', 'mean_pooling', 'completion_mean', or 'concat'\n  embed_method: Literal['last_token', 'mean_pooling', 'completion_mean', 'concat'] = last_token\n  # Apply SVD whitening to feature embeddings\n  use_whitening: bool = False\n  # Coefficient for alignment reward (cosine similarity with ground truth)\n  alignment_coef: float = 1.0\n  # Coefficient for diversity penalty (pairwise similarity between samples)\n  diversity_coef: float = 1.0\n  # Cross-entropy loss coefficient on ground-truth tokens\n  ce_coef: float = 0.0\n  # Set per-batch max_tokens based on ground-truth length\n  adaptive_max_tokens: bool = True\n  # Multiplier for ground-truth token count when computing adaptive max_tokens\n  gt_length_multiplier: float = 1.5\n\n  # EBFT mode: 'structured' (QA with vLLM) or 'strided' (unstructured text)\n  mode: Literal['structured', 'strided'] = structured\n  # Stride between anchor points (tokens)\n  stride: int = 8\n  # Context window size per block\n  context_length: int = 8\n  # Tokens to generate per block\n  generate_max_len: int = 8\n  # Independent rollouts per document\n  n_samples_per_prompt: int = 4\n  # Sampling temperature for strided generation\n  temperature: float = 0.6\n  # Top-p nucleus sampling threshold\n  top_p: float = 1.0\n  # RL policy gradient loss coefficient\n  rl_coef: float = 1.0\n  # Advantage estimator: 'rloo', 'group_norm', 'reinforce'\n  advantage_estimator: Literal['rloo', 'group_norm', 'reinforce'] = rloo\n  # Minimum tokens into completion before placing anchors. 
Skips anchors too close to\n  # the prompt boundary where features are dominated by prompt context.\n  min_completion_prefix: int = 0\n\nqat: QATConfig | None\n  # For QATConfig:\n  # Fake quantization layout to use for activation quantization.\n  activation_dtype: TorchAOQuantDType | None\n  # Fake quantization layout to use for weight quantization.\n  weight_dtype: TorchAOQuantDType = TorchAOQuantDType.int8\n  # Quantize embedding\n  quantize_embedding: bool | None = False\n  # The number of elements in each group for per-group fake quantization\n  group_size: int | None = 32\n  # The number of steps to apply fake quantization after\n  fake_quant_after_n_steps: int | None\n\nquantization: PTQConfig | None\n  # For PTQConfig:\n  # Fake quantization layout to use for weight quantization.\n  weight_dtype: TorchAOQuantDType = TorchAOQuantDType.int8\n  # Fake quantization layout to use for activation quantization.\n  activation_dtype: TorchAOQuantDType | None\n  # Whether to quantize the embedding layer.\n  quantize_embedding: bool | None\n  # The number of elements in each group for per-group fake quantization\n  group_size: int | None = 32\n\n# Reward modelling: `True` or `False`\nreward_model: bool | None\n\n# Configuration for dynamic checkpointing (trigger by file or signal). Set 'enabled:\n# true' to activate this feature.\ndynamic_checkpoint: DynamicCheckpointConfig | None\n  # For DynamicCheckpointConfig:\n  # Enable dynamic checkpoint triggering during training. Create a file\n  # 'axolotl_checkpoint.save' in the configured `output_dir` to trigger.\n  enabled: bool = False\n  # Check for trigger file every N steps (reduces I/O overhead). Default: 10\n  check_interval: int = 10\n  # Custom trigger filename (optional). If not specified, defaults to\n  # 'axolotl_checkpoint.save'. 
Specify a filename (not a full path) to override the\n  # default.\n  trigger_file_path: str = \n\n# Process reward modelling: `True` or `False`\nprocess_reward_model: bool | None\n# Coefficient to incentivize the reward model to output mean-zero rewards (proposed by\n# https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`.\ncenter_rewards_coefficient: float | None\nnum_labels: int | None\n\n# Whether to perform weighting in DPO trainer\ndpo_use_weighting: bool | None\ndpo_label_smoothing: float | None\n# Precompute reference model log probabilities for DPO\nprecompute_ref_log_probs: bool | None\n\n# Whether to use Liger kernel for DPO loss.\ndpo_use_liger_kernel: bool | None\n\ndpo_padding_free: bool | None\n\n# A list of one or more datasets to finetune the model with\ndatasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset | SyntheticDataset], MinLen(1)] | None\n  # For SFTDataset:\n  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n  path: str | None\n  # name of dataset split to load from\n  split: str | None\n  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n  type: str | UserDefinedPrompterType | None\n    # For UserDefinedPrompterType:\n    # Custom user instruction prompt\n    system_prompt: str | None\n    # Use {system} as key to be replaced\n    system_format: str | None\n    field_system: str | None\n    field_instruction: str | None\n    field_input: str | None\n    field_output: str | None\n\n    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n    # be replaced. 
'format' can include {input}\n    format: str | None\n    # 'no_input_format' cannot include {input}\n    no_input_format: str | None\n  input_transform: str | None\n  # split dataset into N pieces (use with shards_idx)\n  shards: int | None\n  # the index of sharded dataset to use\n  shards_idx: int | None\n  # process dataset in N sequential chunks for memory efficiency (exclusive with\n  # `shards`)\n  preprocess_shards: int | None\n  conversation: str | None\n\n  # The name of the chat template to use for training, following values are supported:\n  # tokenizer_default: Uses the chat template that is available in the\n  # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n  # will raise an error. This is the default.\n  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n  # template. The custom jinja template should be provided in the chat_template_jinja\n  # field.\n  chat_template: ChatTemplate | str | None\n  # Custom jinja chat template or path to jinja file. 
Used only if `chat_template:\n  # jinja` or empty.\n  chat_template_jinja: str | None\n  # path to source data files\n  data_files: str | list[str] | None\n  input_format: str | None\n  # name of dataset configuration to load\n  name: str | None\n  # defines the datatype when path is a file\n  ds_type: str | None\n  # For `completion` datasets only, uses the provided field instead of `text` column\n  field: str | None\n  field_human: str | None\n  field_model: str | None\n  # Key containing the messages (default: \"messages\")\n  field_messages: str | None\n  # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n  # schema](https://json-schema.org/learn/getting-started-step-by-step).\n  field_tools: str | None\n  # Key containing the reasoning trace (default: \"reasoning_content\").\n  field_thinking: str | None\n  # The key the chat template expects that indicates the reasoning trace.\n  template_thinking_key: str | None\n\n  message_field_role: str | None\n\n  message_field_content: str | None\n  # Mapping of properties from the input dataset to the chat template. (default:\n  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n  # in the template but not in this mapping, the system will attempt to load it directly\n  # from the message using the property name as the key. Example: In the mapping below,\n  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n  # used as 'content' in the chat template.\n  message_property_mappings: dict[str, str] | None\n  # The key in the message turn that indicates via boolean whether tokens of a turn\n  # should be considered for training. Useful to selectively train on certain turns\n  # besides the `roles_to_train`.\n  message_field_training: str | None\n  # The key in the message turn that contains the training details. Useful to\n  # selectively train on certain tokens in a turn. 
The value of the key is a List[Dict]\n  # containing `begin_offset` (start character index in content), `end_offset` (end\n  # character index in content), and `train` (boolean whether to train).\n  message_field_training_detail: str | None\n  # (for Qwen3 template only) Whether to split the assistant content based on a\n  # reasoning trace inside delimited tags\n  split_thinking: bool | None\n  logprobs_field: str | None\n  temperature: float | None\n  # Roles to train on. The tokens from these roles will be considered for the loss.\n  roles_to_train: list[str] | None\n  # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n  # turn, last: train on the last EOS token in the conversation\n  train_on_eos: Literal['all', 'turn', 'last'] | None\n  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n  # source roles will be mapped to the target role. The default is: user: [\"human\",\n  # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n  roles: dict[str, list[str]] | None\n  # Whether to drop the system turn from the dataset. Only works with chat_template.\n  # This does not drop the default system message from chat_template if it exists. If\n  # you wish to, we recommend using a custom jinja template with the default system\n  # message removed or adding a system turn with empty content.\n  drop_system_message: bool | None\n  # Trust remote code for untrusted source\n  trust_remote_code: bool | None = False\n  # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n  # This can be a commit hash, tag, or branch name. If not specified, the latest version\n  # will be used. 
This parameter is ignored for local datasets.\n  revision: str | None\n\n  # For DPODataset:\n  path: str | None\n  split: str | None\n  type: UserDefinedDPOType | str | None\n    # For UserDefinedDPOType:\n    field_system: str | None\n    field_prompt: str | None\n    field_chosen: str | None\n    field_rejected: str | None\n    prompt_format: str | None\n    chosen_format: str | None\n    rejected_format: str | None\n  data_files: list[str] | None\n  revision: str | None\n  field_messages: str | None\n\n  # For KTODataset:\n  path: str | None\n  split: str | None\n  type: UserDefinedKTOType | str | None\n    # For UserDefinedKTOType:\n    field_system: str | None\n    field_prompt: str | None\n    field_completion: str | None\n    field_label: bool | None\n    prompt_format: str | None\n    completion_format: str | None\n  data_files: list[str] | None\n  trust_remote_code: bool | None = False\n  revision: str | None\n\n  # For StepwiseSupervisedDataset:\n  path: str | None\n  split: str | None\n  data_files: list[str] | None\n  revision: str | None\n  step_separator: str | None\n  max_completion_length: int | None\n  train_on_last_step_only: bool | None\n\n  # For SyntheticDataset:\n  path: Literal['synthetic'] = synthetic\n  type: Literal['_synthetic'] = _synthetic\n  # Number of rows to generate\n  length: int = 1000\n  # Sequence length per row (defaults to sequence_len from config)\n  sequence_length: int | None\n  # Minimum token ID for generation\n  min_input_id: int = 100\n  # Maximum token ID for generation (defaults to tokenizer vocab_size)\n  max_input_id: int | None\n  # Random seed for reproducibility\n  seed: int | None\n\n# A list of one or more datasets to eval the model with. 
You can use either\n# test_datasets, or val_set_size, but not both.\ntest_datasets: Annotated[list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset | SyntheticDataset], MinLen(1)] | None\n  # For SFTDataset:\n  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n  path: str | None\n  # name of dataset split to load from\n  split: str | None\n  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n  type: str | UserDefinedPrompterType | None\n    # For UserDefinedPrompterType:\n    # Custom user instruction prompt\n    system_prompt: str | None\n    # Use {system} as key to be replaced\n    system_format: str | None\n    field_system: str | None\n    field_instruction: str | None\n    field_input: str | None\n    field_output: str | None\n\n    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n    # be replaced. 'format' can include {input}\n    format: str | None\n    # 'no_input_format' cannot include {input}\n    no_input_format: str | None\n  input_transform: str | None\n  # split dataset into N pieces (use with shards_idx)\n  shards: int | None\n  # the index of sharded dataset to use\n  shards_idx: int | None\n  # process dataset in N sequential chunks for memory efficiency (exclusive with\n  # `shards`)\n  preprocess_shards: int | None\n  conversation: str | None\n\n  # The name of the chat template to use for training, following values are supported:\n  # tokenizer_default: Uses the chat template that is available in the\n  # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n  # will raise an error. 
This is the default.\n  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n  # template. The custom jinja template should be provided in the chat_template_jinja\n  # field.\n  chat_template: ChatTemplate | str | None\n  # Custom jinja chat template or path to jinja file. Used only if `chat_template:\n  # jinja` or empty.\n  chat_template_jinja: str | None\n  # path to source data files\n  data_files: str | list[str] | None\n  input_format: str | None\n  # name of dataset configuration to load\n  name: str | None\n  # defines the datatype when path is a file\n  ds_type: str | None\n  # For `completion` datasets only, uses the provided field instead of `text` column\n  field: str | None\n  field_human: str | None\n  field_model: str | None\n  # Key containing the messages (default: \"messages\")\n  field_messages: str | None\n  # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n  # schema](https://json-schema.org/learn/getting-started-step-by-step).\n  field_tools: str | None\n  # Key containing the reasoning trace (default: \"reasoning_content\").\n  field_thinking: str | None\n  # The key the chat template expects that indicates the reasoning trace.\n  template_thinking_key: str | None\n\n  message_field_role: str | None\n\n  message_field_content: str | None\n  # Mapping of properties from the input dataset to the chat template. 
(default:\n  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n  # in the template but not in this mapping, the system will attempt to load it directly\n  # from the message using the property name as the key. Example: In the mapping below,\n  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n  # used as 'content' in the chat template.\n  message_property_mappings: dict[str, str] | None\n  # The key in the message turn that indicates via boolean whether tokens of a turn\n  # should be considered for training. Useful to selectively train on certain turns\n  # besides the `roles_to_train`.\n  message_field_training: str | None\n  # The key in the message turn that contains the training details. Useful to\n  # selectively train on certain tokens in a turn. The value of the key is a List[Dict]\n  # containing `begin_offset` (start character index in content), `end_offset` (end\n  # character index in content), and `train` (boolean whether to train).\n  message_field_training_detail: str | None\n  # (for Qwen3 template only) Whether to split the assistant content based on a\n  # reasoning trace inside delimited tags\n  split_thinking: bool | None\n  logprobs_field: str | None\n  temperature: float | None\n  # Roles to train on. The tokens from these roles will be considered for the loss.\n  roles_to_train: list[str] | None\n  # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n  # turn, last: train on the last EOS token in the conversation\n  train_on_eos: Literal['all', 'turn', 'last'] | None\n  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n  # source roles will be mapped to the target role. 
The default is: user: [\"human\",\n  # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n  roles: dict[str, list[str]] | None\n  # Whether to drop the system turn from the dataset. Only works with chat_template.\n  # This does not drop the default system message from chat_template if it exists. If\n  # you wish to, we recommend using a custom jinja template with the default system\n  # message removed or adding a system turn with empty content.\n  drop_system_message: bool | None\n  # Trust remote code for untrusted source\n  trust_remote_code: bool | None = False\n  # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n  # This can be a commit hash, tag, or branch name. If not specified, the latest version\n  # will be used. This parameter is ignored for local datasets.\n  revision: str | None\n\n  # For DPODataset:\n  path: str | None\n  split: str | None\n  type: UserDefinedDPOType | str | None\n    # For UserDefinedDPOType:\n    field_system: str | None\n    field_prompt: str | None\n    field_chosen: str | None\n    field_rejected: str | None\n    prompt_format: str | None\n    chosen_format: str | None\n    rejected_format: str | None\n  data_files: list[str] | None\n  revision: str | None\n  field_messages: str | None\n\n  # For KTODataset:\n  path: str | None\n  split: str | None\n  type: UserDefinedKTOType | str | None\n    # For UserDefinedKTOType:\n    field_system: str | None\n    field_prompt: str | None\n    field_completion: str | None\n    field_label: bool | None\n    prompt_format: str | None\n    completion_format: str | None\n  data_files: list[str] | None\n  trust_remote_code: bool | None = False\n  revision: str | None\n\n  # For StepwiseSupervisedDataset:\n  path: str | None\n  split: str | None\n  data_files: list[str] | None\n  revision: str | None\n  step_separator: str | None\n  max_completion_length: int | None\n  train_on_last_step_only: bool | None\n\n  # For 
SyntheticDataset:\n  path: Literal['synthetic'] = synthetic\n  type: Literal['_synthetic'] = _synthetic\n  # Number of rows to generate\n  length: int = 1000\n  # Sequence length per row (defaults to sequence_len from config)\n  sequence_length: int | None\n  # Minimum token ID for generation\n  min_input_id: int = 100\n  # Maximum token ID for generation (defaults to tokenizer vocab_size)\n  max_input_id: int | None\n  # Random seed for reproducibility\n  seed: int | None\n\n# If false, the datasets will not be shuffled and will keep their original order in\n# `datasets`. The same applies to the `test_datasets` option and the\n# `pretraining_dataset` option. Default is true.\nshuffle_merged_datasets: bool | None = True\n# If true, each dataset in `datasets` will be shuffled before merging. This allows\n# curriculum learning strategies to be applied at the dataset level. Default is false.\nshuffle_before_merging_datasets: bool | None = False\n# Axolotl attempts to save the dataset as an arrow after packing the data together so\n# subsequent training attempts load faster, relative path\ndataset_prepared_path: str | None\n# Num shards for whole dataset\ndataset_shard_num: int | None\n# Index of shard to use for whole dataset\ndataset_shard_idx: int | None\nskip_prepare_dataset: bool | None = False\n# Number of shards to save the prepared dataset\nnum_dataset_shards_to_save: int | None\n\n# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize\npretraining_dataset: Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None\n  # For PretrainingDataset:\n  name: str | None\n  path: str | None\n  split: str | None = train\n  text_column: str | None = text\n  type: str | None = pretrain\n  trust_remote_code: bool | None = False\n  data_files: str | None\n  skip: int | None\n\n  # For SFTDataset:\n  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n  path: str | None\n  # name of dataset split to load from\n  
split: str | None\n  # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n  type: str | UserDefinedPrompterType | None\n    # For UserDefinedPrompterType:\n    # Custom user instruction prompt\n    system_prompt: str | None\n    # Use {system} as key to be replaced\n    system_format: str | None\n    field_system: str | None\n    field_instruction: str | None\n    field_input: str | None\n    field_output: str | None\n\n    # Customizable to be single line or multi-line. Use {instruction}/{input} as key to\n    # be replaced. 'format' can include {input}\n    format: str | None\n    # 'no_input_format' cannot include {input}\n    no_input_format: str | None\n  input_transform: str | None\n  # split dataset into N pieces (use with shards_idx)\n  shards: int | None\n  # the index of sharded dataset to use\n  shards_idx: int | None\n  # process dataset in N sequential chunks for memory efficiency (exclusive with\n  # `shards`)\n  preprocess_shards: int | None\n  conversation: str | None\n\n  # The name of the chat template to use for training, following values are supported:\n  # tokenizer_default: Uses the chat template that is available in the\n  # tokenizer_config.json. If the chat template is not available in the tokenizer, it\n  # will raise an error. This is the default.\n  # alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n  # are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n  # tokenizer_default_fallback_*: where * is the name of the chat template to fallback\n  # to if the tokenizer does not have a chat template else default to tokenizer. E.g.\n  # tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat\n  # template. The custom jinja template should be provided in the chat_template_jinja\n  # field.\n  chat_template: ChatTemplate | str | None\n  # Custom jinja chat template or path to jinja file. 
Used only if `chat_template:\n  # jinja` or empty.\n  chat_template_jinja: str | None\n  # path to source data files\n  data_files: str | list[str] | None\n  input_format: str | None\n  # name of dataset configuration to load\n  name: str | None\n  # defines the datatype when path is a file\n  ds_type: str | None\n  # For `completion` datasets only, uses the provided field instead of `text` column\n  field: str | None\n  field_human: str | None\n  field_model: str | None\n  # Key containing the messages (default: \"messages\")\n  field_messages: str | None\n  # Key containing the tools (default: \"tools\"). Must be a list[dict] and follow [JSON\n  # schema](https://json-schema.org/learn/getting-started-step-by-step).\n  field_tools: str | None\n  # Key containing the reasoning trace (default: \"reasoning_content\").\n  field_thinking: str | None\n  # The key the chat template expects that indicates the reasoning trace.\n  template_thinking_key: str | None\n\n  message_field_role: str | None\n\n  message_field_content: str | None\n  # Mapping of properties from the input dataset to the chat template. (default:\n  # message_property_mappings={'role':'role', 'content':'content'}) If a property exists\n  # in the template but not in this mapping, the system will attempt to load it directly\n  # from the message using the property name as the key. Example: In the mapping below,\n  # 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and\n  # used as 'content' in the chat template.\n  message_property_mappings: dict[str, str] | None\n  # The key in the message turn that indicates via boolean whether tokens of a turn\n  # should be considered for training. Useful to selectively train on certain turns\n  # besides the `roles_to_train`.\n  message_field_training: str | None\n  # The key in the message turn that contains the training details. Useful to\n  # selectively train on certain tokens in a turn. 
The value of the key is a List[Dict]\n  # containing `begin_offset` (start character index in content), `end_offset` (end\n  # character index in content), and `train` (boolean whether to train).\n  message_field_training_detail: str | None\n  # (for Qwen3 template only) Whether to split the assistant content based on a\n  # reasoning trace inside delimited tags\n  split_thinking: bool | None\n  logprobs_field: str | None\n  temperature: float | None\n  # Roles to train on. The tokens from these roles will be considered for the loss.\n  roles_to_train: list[str] | None\n  # Which EOS tokens to train on in the conversation. Possible values are: all: train on\n  # all EOS tokens, turn (default): train on the EOS token at the end of each trainable\n  # turn, last: train on the last EOS token in the conversation\n  train_on_eos: Literal['all', 'turn', 'last'] | None\n  # Roles mapping in the messages. The format is {target_role: [source_roles]}. All\n  # source roles will be mapped to the target role. The default is: user: [\"human\",\n  # \"user\"], assistant: [\"gpt\", \"assistant\"], system: [\"system\"], tool: [\"tool\"]\n  roles: dict[str, list[str]] | None\n  # Whether to drop the system turn from the dataset. Only works with chat_template.\n  # This does not drop the default system message from chat_template if it exists. If\n  # you wish to, we recommend using a custom jinja template with the default system\n  # message removed or adding a system turn with empty content.\n  drop_system_message: bool | None\n  # Trust remote code for untrusted source\n  trust_remote_code: bool | None = False\n  # The specific revision of the dataset to use when loading from the Hugging Face Hub.\n  # This can be a commit hash, tag, or branch name. If not specified, the latest version\n  # will be used. This parameter is ignored for local datasets.\n  revision: str | None\n\n# The maximum number of processes to use while preprocessing your input dataset. 
This\n# defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of\n# vCPUs via RUNPOD_CPU_COUNT.\ndataset_processes: int | None\n# The maximum number of processes to use while preprocessing your input dataset. This\n# defaults to `os.cpu_count()` if not set. For Runpod VMs, it will default to number of\n# vCPUs via RUNPOD_CPU_COUNT.\ndataset_num_proc: int | None\n\n# Deduplicates datasets and test_datasets with identical entries\ndataset_exact_deduplication: bool | None\n# Keep dataset in memory while preprocessing. Only needed if cached dataset is taking\n# too much storage\ndataset_keep_in_memory: bool | None\ndataloader_pin_memory: bool | None\ndataloader_num_workers: int | None\ndataloader_prefetch_factor: int | None\ndataloader_drop_last: bool | None\n\naccelerator_config: dict[str, Any] | None\n\nremove_unused_columns: bool | None\n\n# Push prepared dataset to hub - repo_org/repo_name\npush_dataset_to_hub: str | None\n# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private\n# datasets. Required to be true when used in combination with `push_dataset_to_hub`\nhf_use_auth_token: bool | None\n\ndevice: Any | None\n# Passed through to transformers when loading the model when launched without\n# accelerate. Use `sequential` when training w/ model parallelism to limit memory\ndevice_map: Any | None\nworld_size: int | None\n# Don't mess with this, it's here for accelerate and torchrun\nlocal_rank: int | None\nddp: bool | None\n\n# Seed for reproducibility\nseed: int | None\n# Advanced DDP Arguments - timeout\nddp_timeout: int | None\n# Advanced DDP Arguments - bucket cap in MB\nddp_bucket_cap_mb: int | None\n# Advanced DDP Arguments - broadcast buffers\nddp_broadcast_buffers: bool | None\nddp_find_unused_parameters: bool | None\n\n# Whether to run causal language model evaluation for metrics in\n# `eval_causal_lm_metrics`\ndo_causal_lm_eval: bool | None\n# HF evaluate metrics used during evaluation. 
Default is ['sacrebleu', 'comet', 'ter',\n# 'chrf', 'perplexity']\neval_causal_lm_metrics: list[str] | None\ndo_bench_eval: bool | None\nbench_dataset: str | None\nbench_split: str | None\nmetric_for_best_model: str | None\ngreater_is_better: bool | None\n\n# High loss value, indicating the learning has broken down (a good estimate is ~2 times\n# the loss at the start of training)\nloss_watchdog_threshold: float | None\n# Number of high-loss steps in a row before the trainer aborts (default: 3)\nloss_watchdog_patience: int | None\n\n# Run garbage collection every `gc_steps` steps. -1 will run on epoch end and before\n# evaluations. Default is 0 (disabled).\ngc_steps: int | None\n\n# Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection.\n# require &gt;=ampere\nbf16: Literal['auto'] | bool | None = auto\n# Use CUDA fp16\nfp16: bool | None\n# Enable FP8 mixed precision training using TorchAO. Best used in combination with\n# torch.compile.\nfp8: bool | None\n# Enable FSDP float8 all-gather optimization for FP8 training. Can improve training\n# speed by 10-15% when FSDP is enabled.\nfp8_enable_fsdp_float8_all_gather: bool | None\n# No AMP (automatic mixed precision) - require &gt;=ampere\nbfloat16: bool | None\n# No AMP (automatic mixed precision)\nfloat16: bool | None\n# bool to use CUDA tf32 or 'auto' for automatic detection - require &gt;=ampere\ntf32: Literal['auto'] | bool | None = auto\nfloat32: bool | None\n\n# Whether to use gradient checkpointing. Available options are: true, false, 'offload',\n# 'offload_disk'.\n# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\ngradient_checkpointing: Literal['offload', 'offload_disk'] | bool | None = False\n# Additional kwargs to pass to the trainer for gradient checkpointing\ngradient_checkpointing_kwargs: dict[str, Any] | None\n# Whether to offload activations. 
Available options are: true, false, 'legacy', 'disk'.\nactivation_offloading: Literal['legacy', 'disk'] | bool | None = False\n# Offload model layer parameters to CPU during forward, prefetch back during backward.\nlayer_offloading: bool | None = False\n\n# Freeze multimodal encoder parameters (vision, audio, etc.) for text-only training of\n# multimodal models. When True, parameters belonging to vision towers, audio towers,\n# multimodal projectors, and similar non-language modules are frozen\n# (requires_grad=False). This allows DDP training without\n# ddp_find_unused_parameters=True.\nfreeze_mm_modules: bool | None\n\n# List of regex patterns for parameter names to keep unfrozen. All other parameters will\n# be frozen via requires_grad=False. Note: range-based patterns (e.g.\n# embed_tokens.weight$[:32000]) use gradient zeroing rather than a true freeze, so\n# weight decay will still apply to the frozen portion and optimizer states are allocated\n# for the full parameter.\nunfrozen_parameters: list[str] | None\n\n# The maximum length of an input to train with, this should typically be less than 2048\n# as most models have a token/context limit of 2048\nsequence_len: int = 512\n# What to do when a tokenized row exceeds sequence_len. 'drop' removes the row;\n# 'truncate' slices tensors to sequence_len; 'raise' raises a ValueError. Defaults to\n# 'drop' for backward compatibility.\nexcess_length_strategy: Literal['drop', 'truncate', 'raise'] | None\n# The maximum length of an input for evaluation. If not specified, defaults to\n# sequence_len\neval_sequence_len: int | None\nmin_sample_len: int | None\n# maximum prompt length for RL training\nmax_prompt_len: int | None\n# Use efficient multi-packing with block diagonal attention and per sequence\n# position_ids. Recommend set to 'true'\nsample_packing: bool | None\n# The number of samples packed at a time. 
Increasing the following values helps with\n# packing, but usually only slightly (&lt;1%).\nsample_packing_group_size: int | None = 100000\n# The number of samples which can be packed into one sequence. Increase if using a large\n# sequence_len with many short samples.\nsample_packing_bin_size: int | None = 200\n# Whether to pack samples sequentially\nsample_packing_sequentially: bool | None\n# The multiprocessing start method to use for packing. Should be 'fork', 'spawn' or\n# 'forkserver'\nsample_packing_mp_start_method: str | None\n# Set to 'false' if getting errors during eval with sample_packing on\neval_sample_packing: bool | None\n# Pad inputs so each step uses constant sized buffers. This will reduce memory\n# fragmentation and may prevent OOMs, by re-using memory more efficiently. Defaults to\n# True if `sample_packing` enabled\npad_to_sequence_len: bool | None\n# Whether to use sequential sampling for curriculum learning\ncurriculum_sampling: bool | None\nmultipack_real_batches: bool | None\n\n# Use batch flattening for speedups when not using sample_packing\nbatch_flattening: Literal['auto'] | bool | None\n\nuse_pose: bool | None\npose_split_on_token_ids: list[int] | None\npose_max_context_len: int | None\npose_num_chunks: int | None\n\npretrain_multipack_buffer_size: int | None\n# whether to prevent cross attention for packed sequences during pretraining\npretrain_multipack_attn: bool | None = True\n# whether to concatenate samples during pretraining\npretraining_sample_concatenation: bool | None\n\n# Use streaming mode for loading datasets\nstreaming: bool | None\n# Buffer size for multipack streaming datasets\nstreaming_multipack_buffer_size: int | None = 10000\n\n# Whether to use xformers attention patch https://github.com/facebookresearch/xformers\nxformers_attention: bool | None\n# Whether to use scaled-dot-product attention\n# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html\nsdp_attention: bool | 
None\n# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf\ns2_attention: bool | None\nflex_attention: bool | None\nflex_attn_compile_kwargs: dict[str, Any] | None\n# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention\nflash_attention: bool | None\n# Whether to use flash-attention cross entropy implementation - advanced use only\nflash_attn_cross_entropy: bool | None\n# Whether to use flash-attention rms norm implementation - advanced use only\nflash_attn_rms_norm: bool | None\n# Whether to fuse part of the MLP into a single operation\nflash_attn_fuse_mlp: bool | None\n# Whether to use bettertransformers\nflash_optimum: bool | None\n# Whether to use SageAttention https://github.com/thu-ml/SageAttention\nsage_attention: bool | None\n\neager_attention: bool | None\n\n# Specify a custom attention implementation, used mostly for kernels.\nattn_implementation: str | None\n\n# Use hybrid attention for Gemma 4: flash_attention_2 for sliding window layers and sdpa\n# for global (full_attention) layers. Global layers have head_dim=512 which exceeds\n# flash attention's supported size.\ngemma4_hybrid_attn_impl: bool | None\n\n# Which experts implementation to use for MoE models,\nexperts_implementation: str | None\n\n# Quantize MoE expert weights on load to reduce VRAM. Requires adapter (lora/qlora) with\n# load_in_4bit or load_in_8bit. Requires CUDA (not compatible with ROCm or other\n# backends). Note: total parameter count may be reported incorrectly when enabled\n# (trainable param count is correct).\nquantize_moe_experts: bool = False\n\n# Whether to use Scaled Softmax (SSMax) attention. Ref: https://arxiv.org/abs/2501.19399\nscaling_softmax: bool | None\n# Scaling factor for SSMax attention. Default is 0.43\nscaling_softmax_factor: float | None\n# Bias for SSMax attention. Default is 0.0. 
Note: The paper recommends bias=0 for better\n# length generalization.\nscaling_softmax_bias: float | None\n\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_mlp_kernel: bool | None\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_qkv_kernel: bool | None\n# Apply custom LoRA autograd functions and activation function Triton kernels for speed\n# and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_o_kernel: bool | None\n# Apply custom LoRA autograd function for embedding layers. See:\n# https://docs.axolotl.ai/docs/lora_optims.html\nlora_embedding_kernel: bool | None\n\n# Whether to use chunked cross entropy loss for memory efficiency\nchunked_cross_entropy: bool | None\n# Number of chunks to use for chunked cross entropy loss\nchunked_cross_entropy_num_chunks: int | None\n# Enable Entropy-Aware Focal Training loss (EAFT)\nuse_eaft: bool | None\n# Exponent for entropy weighting in EAFT (default: 1.0)\neaft_alpha: float | None = 1.0\n# Number of top logits for entropy approximation (default: 20)\neaft_k: int | None = 20\n\n# Whether to use ALST tiled mlp for memory efficient long context\ntiled_mlp: bool | None\n\n# Number of shards to use for ALST tiled mlp. If unset, it will be set based on\n# seqlen/hidden_size\ntiled_mlp_num_shards: int | None\n\n# Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on\n# llama.\ntiled_mlp_use_original_mlp: bool | None = True\n\nllama4_linearized_experts: bool | None\n\n# Deepspeed config path. 
e.g., deepspeed_configs/zero3.json\ndeepspeed: str | dict[str, Any] | None\n# Whether to use deepcompile for faster training with deepspeed\ndeepcompile: bool | None\n# FSDP configuration\nfsdp: list[str] | None\n\n# FSDP configuration options\nfsdp_config: FSDPConfig | None\n  # For FSDPConfig:\n  # FSDP version\n  fsdp_version: int | None\n  # Enable activation checkpointing to reduce memory usage during forward passes\n  activation_checkpointing: bool | None\n  # Offload parameters to CPU to reduce GPU memory usage\n  offload_params: bool | None\n  # Synchronize module states across all processes\n  sync_module_states: bool | None\n  # Enable CPU RAM efficient loading to reduce memory usage during model loading\n  cpu_ram_efficient_loading: bool | None\n  # Disabling this enables swap memory usage for resource-constrained setups when\n  # offload_params is enabled.\n  cpu_offload_pin_memory: bool | None\n  # Use original parameters instead of flattened parameters\n  use_orig_params: bool | None\n\n  # Type of state dict to use for saving/loading checkpoints\n  state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n  # Final state dict type to use after training completion\n  final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n\n  # Policy for automatically wrapping modules with FSDP\n  auto_wrap_policy: Literal['TRANSFORMER_BASED_WRAP', 'SIZE_BASED_WRAP'] | None\n  # Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')\n  transformer_layer_cls_to_wrap: str | None\n\n  # Reshard parameters after forward pass to save memory\n  reshard_after_forward: bool | None\n  # Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')\n  mixed_precision_policy: str | None\n\n# FSDP version\nfsdp_version: int | None\nfsdp_final_state_dict_type: Literal['FULL_STATE_DICT', 'LOCAL_STATE_DICT', 'SHARDED_STATE_DICT'] | None\n\n# How much of the dataset to set aside as evaluation. 
1 = 100%, 0.50 = 50%, etc. 0 for\n# no eval.\nval_set_size: float | None = 0.0\n\n# Number of devices to shard across. If not set, will use all available devices.\ndp_shard_size: int | None\n# Number of devices to replicate across.\ndp_replicate_size: int | None\n# Deprecated: use `context_parallel_size` instead\nsequence_parallel_degree: int | None\n# Set to a divisor of the number of GPUs available to split sequences into chunks of\n# equal size. Use in long context training to prevent OOM when sequences cannot fit into\n# a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each\n# sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized\n# subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more\n# details.\ncontext_parallel_size: int | None\n# Optional; strides across the key dimension. Larger values use more memory but should\n# make training faster. Must evenly divide the number of KV heads in your model.\nheads_k_stride: int | None\n# One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to\n# 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing\n# case.\nring_attn_func: RingAttnFunc | None\n# Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP.\ntensor_parallel_size: int | None\n\n# Add or change special tokens. If you add tokens here, you don't need to add them to\n# the `tokens` list.\nspecial_tokens: SpecialTokensConfig | None\n  # For SpecialTokensConfig:\n  bos_token: str | None\n  eos_token: str | None\n  pad_token: str | None\n  unk_token: str | None\n  additional_special_tokens: list[str] | None\n\n# Add extra tokens to the tokenizer\ntokens: list[str] | None\n# Mapping token_id to new_token_string to override reserved added_tokens in the\n# tokenizer. Only works for tokens that are not part of the base vocab (aka are\n# added_tokens). 
Can be checked if they exist in tokenizer.json added_tokens.\nadded_tokens_overrides: dict[int, str] | None\n\n# Whether to use torch.compile and which backend to use. Setting to `auto` will enable\n# torch compile when torch&gt;=2.6.0\ntorch_compile: Literal['auto'] | bool | None\n# Backend to use for torch.compile\ntorch_compile_backend: str | None\ntorch_compile_mode: Literal['default', 'reduce-overhead', 'max-autotune'] | None\n\n# Maximum number of iterations to train for. It takes precedence over num_epochs, which\n# means that if both are set, num_epochs will not be guaranteed. E.g., when 1 epoch is\n# 1000 steps =&gt; `num_epochs: 2` and `max_steps: 100` will train for 100 steps\nmax_steps: int | None\n# Number of warmup steps. Cannot use with warmup_ratio\nwarmup_steps: int | None\n# Warmup ratio. Cannot use with warmup_steps\nwarmup_ratio: float | None\n# Leave empty to eval at each epoch, integer for every N steps, float for a fraction of\n# total steps\neval_steps: int | float | None\n# Number of times per epoch to run evals, mutually exclusive with eval_steps\nevals_per_epoch: int | None\n# Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer\n# from `eval_steps`\neval_strategy: str | None\n\n# Leave empty to save at each epoch, integer for every N steps, float for a fraction of\n# total steps\nsave_steps: int | float | None\n# Number of times per epoch to save a checkpoint, mutually exclusive with save_steps\nsaves_per_epoch: int | None\n# Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better\n# result is achieved, leave empty to infer from `save_steps`\nsave_strategy: str | None\n# Checkpoints saved at a time\nsave_total_limit: int | None\n# Whether to checkpoint a model after the first step of training. Defaults to False.\nsave_first_step: bool | None\n\n# Logging frequency\nlogging_steps: int | None\n# Stop training after this many evaluation losses have increased in a row. 
See\n# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback\nearly_stopping_patience: int | None\nload_best_model_at_end: bool | None = False\n# Save only the model weights, skipping the optimizer. Using this means you can't resume\n# from checkpoints.\nsave_only_model: bool | None = False\n# Use tensorboard for logging\nuse_tensorboard: bool | None\n# Enable the pytorch profiler to capture the first N steps of training to the\n# output_dir. See https://pytorch.org/blog/understanding-gpu-memory-1/ for more\n# information. Snapshots can be visualized @ https://pytorch.org/memory_viz\nprofiler_steps: int | None\n# Which step to start the profiler at. Useful for only capturing a few steps mid-run.\nprofiler_steps_start: int | None = 0\n# Whether to report tokens per second at the end of training. This is not\n# supported with pre-training datasets.\ninclude_tokens_per_second: bool | None\n# Whether to report tokens per second per-gpu during training by measuring\n# throughput of non-padding tokens.\ninclude_tkps: bool | None = True\n# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to\n# add noise to embeddings. Currently only supported on Llama and Mistral\nneftune_noise_alpha: float | None\n\n# Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to\n# `beta` in `ORPOConfig` due to trl mapping.\norpo_alpha: float | None\n# Target reward margin for the SimPO loss\nsimpo_gamma: float | None\n# Weight of the BC regularizer\ncpo_alpha: float | None\n\n# Factor for desirable loss term in KTO loss\nkto_desirable_weight: float | None\n# Factor for undesirable loss term in KTO loss\nkto_undesirable_weight: float | None\n# The beta parameter for the RL training\nrl_beta: float | None\n\n# Defines the max memory usage per gpu on the system. 
Passed through to transformers\n# when loading the model.\nmax_memory: dict[int | Literal['cpu', 'disk'], int | str] | None\n# Limit the memory for all available GPUs to this amount (if an integer, expressed in\n# gigabytes); default: unset\ngpu_memory_limit: int | str | None\n# Whether to use low_cpu_mem_usage\nlow_cpu_mem_usage: bool | None\n\n# The name of the chat template to use for training, following values are supported:\n# tokenizer_default: Uses the chat template that is available in the\n# tokenizer_config.json. If the chat template is not available in the tokenizer, it will\n# raise an error. This is the default value.\n# alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates\n# are available in the axolotl codebase at src/axolotl/utils/chat_templates.py.\n# tokenizer_default_fallback_*: where * is the name of the chat template to fallback to.\n# E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not\n# available in the tokenizer. jinja: Uses a custom jinja template for the chat template.\n# The custom jinja template should be provided in the chat_template_jinja field. The\n# selected chat template will be saved to the tokenizer_config.json for easier\n# inferencing\nchat_template: ChatTemplate | Annotated[str, StringConstraints(pattern='^tokenizer_default_fallback_')] | None\n# Custom jinja template or path to jinja file for chat template. This will be only used\n# if chat_template is set to `jinja` or `null` (in which case chat_template is\n# automatically set to `jinja`). Default is null.\nchat_template_jinja: str | None\n# Additional kwargs to pass to the chat template. This is useful for customizing the\n# chat template. For example, you can pass `thinking=False` to add a generation prompt\n# to the chat template.\nchat_template_kwargs: dict[str, Any] | None\n# Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the\n# boundaries between conversation turns. 
For example: ['/INST', '&lt;/s&gt;',\n# '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is\n# useful for templates that use multiple delimiter tokens.\neot_tokens: list[str] | None\n# Changes the default system message. Currently only supports chatml.\ndefault_system_message: str | None\n\n# Token index or indices to adjust embedding weights to the mean of the other tokens.\n# This is useful when the model has untrained embeddings.\nfix_untrained_tokens: int | list[int] | None\n\nis_preprocess: bool | None\npreprocess_iterable: bool | None\n\n# Total number of tokens - internal use\ntotal_num_tokens: int | None\ntotal_supervised_tokens: int | None\n# You can set these packing optimizations AFTER starting a training at least once. The\n# trainer will provide recommended values for these values.\nsample_packing_eff_est: float | None\naxolotl_config_path: str | None\n\n# Internal use only - Used to identify which the model is based on\nis_falcon_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on\nis_llama_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on. Please note that if\n# you set this to true, `padding_side` will be set to 'left' by default\nis_mistral_derived_model: bool | None\n# Internal use only - Used to identify which the model is based on\nis_qwen_derived_model: bool | None\n\n# Add plugins to extend the pipeline. 
See `src/axolotl/integrations` for the available\n# plugins or doc below for more details.\n# https://docs.axolotl.ai/docs/custom_integrations.html\nplugins: list[str] | None\n# Enable sample generation during training for monitoring\ngenerate_samples: bool | None = False\n# Number of samples to generate at each interval\nnum_generation_samples: int | None = 3\n# Maximum new tokens to generate per sample\ngeneration_max_new_tokens: int | None = 50\n# Temperature for sample generation (0.0 = greedy)\ngeneration_temperature: float | None = 0.7\n# Nucleus sampling parameter for generation\ngeneration_top_p: float | None\n# Top-k sampling parameter for generation\ngeneration_top_k: int | None\n# Ratio of input to use as prompt (0.0-1.0)\ngeneration_prompt_ratio: float | None = 0.5\n# Whether to use sampling (vs greedy decoding)\ngeneration_do_sample: bool | None = True\n\n# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This\n# can also be a relative path to a model on disk\nbase_model: str (required)\n# If the base_model repo on hf hub doesn't include configuration .json files, You can\n# set that here, or leave this empty to default to base_model\nbase_model_config: str | None\n# transformers config class (e.g., 'LlamaConfig', 'MistralConfig'). Defaults to\n# AutoConfig.\ncls_model_config: str | None\n# Optional tokenizer configuration path in case you want to use a different tokenizer\n# than the one defined in the base model\ntokenizer_config: str | None\n# use_fast option for tokenizer loading from_pretrained, default to True\ntokenizer_use_fast: bool | None\n# Whether to use the legacy tokenizer setting, defaults to True\ntokenizer_legacy: bool | None\n# Whether to use mistral-common tokenizer. 
If set to True, it will use the mistral-\n# common tokenizer.\ntokenizer_use_mistral_common: bool | None\n# Corresponding tokenizer for the model AutoTokenizer is a good choice\ntokenizer_type: str | None\n# transformers processor class\nprocessor_type: str | None\n# Whether to save jinja files for tokenizer, transformers default is True\ntokenizer_save_jinja_files: bool | None = True\n# Trust remote code for untrusted source\ntrust_remote_code: bool | None\n\n# Don't move the model to the device before sharding. Set to `false` to revert to legacy\n# behavior.\nexperimental_skip_move_to_device: bool | None = True\n\n# Use custom kernels, e.g. MegaBlocks.\nuse_kernels: bool | None\n\n# Model loading quantization config\nmodel_quantization_config: Literal['Mxfp4Config', 'FineGrainedFP8Config'] | None\n# kwargs for model quantization config\nmodel_quantization_config_kwargs: dict[str, Any] | None\n\n# Where to save the full-finetuned model to\noutput_dir: str = ./model-out\n# push checkpoints to hub\nhub_model_id: str | None\n# how to push checkpoints to hub\nhub_strategy: str | None\n# branch/revision to push to on hub (default: main)\nhub_revision: str | None\n# Whether to save the model using safetensors format. Defaults to True.\nsave_safetensors: bool | None = True\n\n# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer\nload_in_8bit: bool | None = False\n# Use bitsandbytes 4 bit\nload_in_4bit: bool | None = False\n\n# If you want to use 'lora', 'qlora', or 'llama-adapter', or leave blank to train all\n# parameters in original model\nadapter: Literal['lora', 'qlora', 'llama-adapter'] | None\n# If you already have a lora model trained that you want to load, put that here. This\n# means after training, if you want to test the model, you should set this to the value\n# of `output_dir`. 
Note that if you merge an adapter to the base model, a new\n# subdirectory `merged` will be created under the `output_dir`.\nlora_model_dir: str | None\nlora_r: int | None\nlora_alpha: int | None\nlora_fan_in_fan_out: bool | None\nlora_target_modules: str | list[str] | None\nlora_target_parameters: str | list[str] | None\n# If true, will target all linear modules\nlora_target_linear: bool | None\n# If you added new tokens to the tokenizer, you may need to save some LoRA modules\n# because they need to know the new tokens. For LLaMA and Mistral, you need to save\n# `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts\n# tokens to embeddings, and `lm_head` converts embeddings to token probabilities.\nlora_modules_to_save: list[str] | None\nlora_dropout: float | None = 0.0\n# The layer indices to transform, otherwise, apply to all layers\npeft_layers_to_transform: list[int] | None\npeft_layers_pattern: list[str] | None\n\npeft: PeftConfig | None\n  # For PeftConfig:\n  # Configuration options for loftq initialization for LoRA\n  loftq_config: LoftQConfig | None\n    # For LoftQConfig:\n    # typically 4 bits\n    loftq_bits: int = 4\n\n# Whether to use DoRA.\npeft_use_dora: bool | None\n# Whether to use RSLoRA.\npeft_use_rslora: bool | None\n# List of layer indices to replicate.\npeft_layer_replication: list[tuple[int, int]] | None\n# How to initialize LoRA weights. Default to True which is MS original implementation.\npeft_init_lora_weights: bool | str | None\n# A list of token indices to fine-tune on the `embed_tokens` layer. Otherwise, a dict\n# mapping an embedding layer name to its trainable token indices. See\n# https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-\n# tokens-alongside-lora\npeft_trainable_token_indices: list[int] | dict[str, list[int]] | None\n# Whether to tie adapter weights for tied model weights. 
See\n# https://github.com/huggingface/peft/issues/2864\npeft_ensure_weight_tying: bool | None\n# Whether to upcast the LoRA adapter to fp32. This is enabled by default in PEFT.\npeft_autocast_adapter_dtype: bool | None\n\n# load qlora model in sharded format for FSDP using answer.ai technique.\nqlora_sharded_model_loading: bool | None = False\n# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it\n# takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge\nlora_on_cpu: bool | None\n# Whether you are training a 4-bit GPTQ quantized model\ngptq: bool | None\n# optional overrides to the bnb 4bit quantization configuration\nbnb_config_kwargs: dict[str, Any] | None\n\n# loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.\nloraplus_lr_ratio: float | None\n# loraplus learning rate for lora embedding layers. Default value is 1e-6.\nloraplus_lr_embedding: float | None = 1e-06\n\nmerge_lora: bool | None\n# Method to use for LoRA merging. 'memory_efficient' (default) processes shards\n# individually to reduce memory usage, 'legacy' loads the full model into memory.\nmerge_method: Literal['legacy', 'memory_efficient'] | None = memory_efficient\n\n# Whether to use ReLoRA. 
Use with jagged_restart_*steps options.\nrelora: bool | None\n# threshold for optimizer magnitude when pruning\nrelora_prune_ratio: float | None\n# True to perform lora weight merges on cpu during restarts, for modest gpu memory\n# savings\nrelora_cpu_offload: bool | None\n\n# how often to reset for jagged restarts\njagged_restart_steps: int | None\n# how many warmup steps to take after reset for jagged restarts\njagged_restart_warmup_steps: int | None\n# how many anneal steps to take before reset for jagged restarts\njagged_restart_anneal_steps: int | None\n\n# If greater than 1, backpropagation will be skipped and the gradients will be\n# accumulated for the given number of steps.\ngradient_accumulation_steps: int | None = 1\n# The number of samples to include in each batch. This is the number of samples sent to\n# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps\nmicro_batch_size: int | None = 1\n# Total batch size, we do not recommended setting this manually\nbatch_size: int | None\n# per gpu micro batch size for evals, defaults to value of micro_batch_size\neval_batch_size: int | None\n\n# whether to find batch size that fits in memory. Passed to underlying transformers\n# Trainer\nauto_find_batch_size: bool | None\n\n# Whether to mask out or include the human's prompt from the training labels\ntrain_on_inputs: bool | None = False\n# Group similarly sized data to minimize padding. May be slower to start, as it must\n# download and sort the entire dataset. 
Note that training loss may have an oscillating\n# pattern with this enabled.\ngroup_by_length: bool | None\n\nlearning_rate: str | float (required)\nembedding_lr: float | None\nembedding_lr_scale: float | None\n# Specify weight decay\nweight_decay: float | None = 0.0\n# Specify optimizer\noptimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED\n# Dictionary of arguments to pass to the optimizer\noptim_args: str | dict[str, Any] | None\n# The target modules to optimize, i.e. the module names that you would like to train,\n# right now this is used only for GaLore algorithm\noptim_target_modules: list[str] | Literal['all_linear'] | None\n# Path to torch distx for optim 'adamw_anyprecision'\ntorchdistx_path: str | None\nlr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE\n# Specify a scheduler and kwargs to use with the optimizer\nlr_scheduler_kwargs: dict[str, Any] | None\nlr_quadratic_warmup: bool | None\n# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of\n# peak lr\ncosine_min_lr_ratio: float | None\n# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means\n# start cosine_min_lr at 80% of training step\ncosine_constant_lr_ratio: float | None\n# Learning rate div factor\nlr_div_factor: float | None\n\nlr_groups: list[LrGroup] | None\n  # For LrGroup:\n  name: str (required)\n  modules: list[str] (required)\n  lr: float (required)\n\n# adamw hyperparams\nadam_epsilon: float | None\n# only used for CAME Optimizer\nadam_epsilon2: float | None\n# adamw hyperparams\nadam_beta1: float | None\n# adamw hyperparams\nadam_beta2: float | None\n# only used for CAME Optimizer\nadam_beta3: float | None\n\n# Dion Optimizer learning rate\ndion_lr: float | None\n# Dion Optimizer momentum\ndion_momentum: float | None\n# Dion Optimizer: r/d fraction for low-rank approximation. 
Used to compute the low-rank\n# dimension.\ndion_rank_fraction: float | None = 1.0\n# Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may\n# be useful to ensure even sharding.\ndion_rank_multiple_of: int | None = 1\n\n# Gradient clipping max norm\nmax_grad_norm: float | None\nnum_epochs: float = 1.0\n\nuse_wandb: bool | None\n# Set the name of your wandb run\nwandb_name: str | None\n# Set the ID of your wandb run\nwandb_run_id: str | None\n# \"offline\" to save run metadata locally and not sync to the server, \"disabled\" to turn\n# off wandb\nwandb_mode: str | None\n# Your wandb project name\nwandb_project: str | None\n# A wandb Team name if using a Team\nwandb_entity: str | None\nwandb_watch: str | None\n# \"checkpoint\" to log model to wandb Artifacts every `save_steps` or \"end\" to log only\n# at the end of training\nwandb_log_model: str | None\n\nuse_mlflow: bool | None\n# URI to mlflow\nmlflow_tracking_uri: str | None\n# Your experiment name\nmlflow_experiment_name: str | None\n# Your run name\nmlflow_run_name: str | None\n# set to true to copy each saved checkpoint on each save to mlflow artifact registry\nhf_mlflow_log_artifacts: bool | None\n\n# Enable or disable Comet integration.\nuse_comet: bool | None\n# API key for Comet. Recommended to set via `comet login`.\ncomet_api_key: str | None\n# Workspace name in Comet. Defaults to the user's default workspace.\ncomet_workspace: str | None\n# Project name in Comet. Defaults to Uncategorized.\ncomet_project_name: str | None\n# Identifier for the experiment. Used to append data to an existing experiment or\n# control the key of new experiments. Default to a random key.\ncomet_experiment_key: str | None\n# Create a new experiment (\"create\") or log to an existing one (\"get\"). Default\n# (\"get_or_create\") auto-selects based on configuration.\ncomet_mode: str | None\n# Set to True to log data to Comet server, or False for offline storage. 
Default is\n# True.\ncomet_online: bool | None\n# Dictionary for additional configuration settings, see the doc for more details.\ncomet_experiment_config: dict[str, Any] | None\n\nuse_trackio: bool | None\n# Your trackio project name\ntrackio_project_name: str | None\n# Set the name of your trackio run\ntrackio_run_name: str | None\n# Hugging Face Space ID to sync dashboard to (optional, runs locally if not provided)\ntrackio_space_id: str | None\n\n# Enable OpenTelemetry metrics collection and Prometheus export\nuse_otel_metrics: bool | None = False\n# Host to bind the OpenTelemetry metrics server to\notel_metrics_host: str | None = localhost\n# Port for the Prometheus metrics HTTP server\notel_metrics_port: int | None = 8000\n\n# the number of activate layers in LISA\nlisa_n_layers: int | None\n# how often to switch layers in LISA\nlisa_step_interval: int | None\n# path under the model to access the layers\nlisa_layers_attribute: str | None = model.layers\n\ngradio_title: str | None\ngradio_share: bool | None\ngradio_server_name: str | None\ngradio_server_port: int | None\ngradio_max_new_tokens: int | None\ngradio_temperature: float | None\n\nuse_ray: bool = False\nray_run_name: str | None\nray_num_workers: int = 1\nresources_per_worker: dict\n\n# The size of the image to resize to. It can be an integer (resized into padded-square\n# image) or a tuple (width, height).If not provided, we will attempt to load from\n# preprocessor.size, otherwise, images won't be resized.\nimage_size: int | tuple[int, int] | None\n# The resampling algorithm to use for image resizing. Default is bilinear. 
Please refer\n# to PIL.Image.Resampling for more details.\nimage_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None\n\n# optional overrides to the base model configuration\noverrides_of_model_config: dict[str, Any] | None\n# optional overrides the base model loading from_pretrained\noverrides_of_model_kwargs: dict[str, Any] | None\n# If you want to specify the type of model to load, AutoModelForCausalLM is a good\n# choice too\ntype_of_model: str | None\n# You can specify to choose a specific model revision from huggingface hub\nrevision_of_model: str | None\n\nmax_packed_sequence_len: int | None\nrope_scaling: Any | None\nnoisy_embedding_alpha: float | None\ndpo_beta: float | None\nevaluation_strategy: str | None\neval_table_size: int | None\neval_max_new_tokens: int | None\ndpo_use_logits_to_keep: bool | None\ndpo_generate_during_eval: bool | None\ndpo_norm_loss: bool | None\nrpo_alpha: float | None",
     "crumbs": [
       "Getting Started",
       "Config Reference"
@@ -6074,7 +6067,7 @@
     "href": "docs/debugging.html#debugging-with-vscode",
     "title": "Debugging",
     "section": "Debugging with VSCode",
-    "text": "Debugging with VSCode\n\nBackground\nThe below example shows how to configure VSCode to debug data preprocessing of the chat_template format. This is the format used when you have the following in your axolotl config:\ndatasets:\n  - path: &lt;path to your chat_template formatted dataset&gt; # example on HF Hub: fozziethebeat/alpaca_messages_2k_test\n    type: chat_template\n\n[!Important]\nIf you are already familiar with advanced VSCode debugging, you can skip the below explanation and look at the files .vscode/launch.json and .vscode/tasks.json for an example configuration.\n\n\n[!Tip]\nIf you prefer to watch a video, rather than read, you can skip to the video tutorial below (but doing both is recommended).\n\n\n\nSetup\nMake sure you have an editable install of Axolotl, which ensures that changes you make to the code are reflected at runtime. Run the following commands from the root of this project:\npip3 install packaging\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n\nRemote Hosts\nIf you developing on a remote host, you can easily use VSCode to debug remotely. To do so, you will need to follow this remote - SSH guide. You can also see the video below on Docker and Remote SSH debugging.\n\n\n\nConfiguration\nThe easiest way to get started is to modify the .vscode/launch.json file in this project. This is just an example configuration, so you may need to modify or copy it to suit your needs.\nFor example, to mimic the command cd devtools && CUDA_VISIBLE_DEVICES=0 axolotl train dev_chat_template.yml, you would use the below configuration1. Note that we add additional flags that override the axolotl config and incorporate the tips above (see the comments). We also set the working directory to devtools and set the env variable HF_HOME to a temporary folder that is later partially deleted. 
This is because we want to delete the HF dataset cache before each run in order to ensure that the data preprocessing code is run from scratch.\n// .vscode/launch.json\n{\n    \"version\": \"0.2.0\",\n    \"configurations\": [\n        {\n            \"name\": \"Debug axolotl prompt - chat_template\",\n            \"type\": \"python\",\n            \"module\": \"accelerate.commands.launch\",\n            \"request\": \"launch\",\n            \"args\": [\n                \"-m\", \"axolotl.cli.train\", \"dev_chat_template.yml\",\n                // The flags below simplify debugging by overriding the axolotl config\n                // with the debugging tips above.  Modify as needed.\n                \"--dataset_num_proc=1\",      // limits data preprocessing to one process\n                \"--max_steps=1\",              // limits training to just one step\n                \"--batch_size=1\",             // minimizes batch size\n                \"--micro_batch_size=1\",       // minimizes batch size\n                \"--val_set_size=0\",           // disables validation\n                \"--sample_packing=False\",     // disables sample packing which is necessary for small datasets\n                \"--eval_sample_packing=False\",// disables sample packing on eval set\n                \"--dataset_prepared_path=temp_debug/axolotl_outputs/data\", // send data outputs to a temp folder\n                \"--output_dir=temp_debug/axolotl_outputs/model\" // send model outputs to a temp folder\n                ],\n            \"console\": \"integratedTerminal\",      // show output in the integrated terminal\n            \"cwd\": \"${workspaceFolder}/devtools\", // set working directory to devtools from the root of the project\n            \"justMyCode\": true,                   // step through only axolotl code\n            \"env\": {\"CUDA_VISIBLE_DEVICES\": \"0\",  // Since we aren't doing distributed training, we need to limit to one GPU\n                    
\"HF_HOME\": \"${workspaceFolder}/devtools/temp_debug/.hf-cache\"}, // send HF cache to a temp folder\n            \"preLaunchTask\": \"cleanup-for-dataprep\", // delete temp folders (see below)\n        }\n    ]\n}\nAdditional notes about this configuration:\n\nThe argument justMyCode is set to true such that you step through only the axolotl code. If you want to step into dependencies, set this to false.\nThe preLaunchTask: cleanup-for-dataprep is defined in .vscode/tasks.json and is used to delete the following folders before debugging, which is essential to ensure that the data pre-processing code is run from scratch:\n\n./devtools/temp_debug/axolotl_outputs\n./devtools/temp_debug/.hf-cache/datasets\n\n\n\n[!Tip]\nYou may not want to delete these folders. For example, if you are debugging model training instead of data pre-processing, you may NOT want to delete the cache or output folders. You may also need to add additional tasks to the tasks.json file depending on your use case.\n\nBelow is the ./vscode/tasks.json file that defines the cleanup-for-dataprep task. This task is run before each debugging session when you use the above configuration. Note how there are two tasks that delete the two folders mentioned above. The third task cleanup-for-dataprep is a composite task that combines the two tasks. 
A composite task is necessary because VSCode does not allow you to specify multiple tasks in the preLaunchTask argument of the launch.json file.\n// .vscode/tasks.json\n// this file is used by launch.json\n{\n    \"version\": \"2.0.0\",\n    \"tasks\": [\n      // this task changes into the devtools directory and deletes the temp_debug/axolotl_outputs folder\n      {\n        \"label\": \"delete-outputs\",\n        \"type\": \"shell\",\n        \"command\": \"rm -rf temp_debug/axolotl_outputs\",\n        \"options\":{ \"cwd\": \"${workspaceFolder}/devtools\"},\n        \"problemMatcher\": []\n      },\n      // this task changes into the devtools directory and deletes the `temp_debug/.hf-cache/datasets` folder\n      {\n        \"label\": \"delete-temp-hf-dataset-cache\",\n        \"type\": \"shell\",\n        \"command\": \"rm -rf temp_debug/.hf-cache/datasets\",\n        \"options\":{ \"cwd\": \"${workspaceFolder}/devtools\"},\n        \"problemMatcher\": []\n      },\n        // this task combines the two tasks above\n      {\n       \"label\": \"cleanup-for-dataprep\",\n       \"dependsOn\": [\"delete-outputs\", \"delete-temp-hf-dataset-cache\"],\n      }\n    ]\n}\n\n\nCustomizing your debugger\nYour debugging use case may differ from the example above. The easiest thing to do is to put your own axolotl config in the devtools folder and modify the launch.json file to use your config. You may also want to modify the preLaunchTask to delete different folders or not delete anything at all.\n\n\nVideo Tutorial\nThe following video tutorial walks through the above configuration and demonstrates how to debug with VSCode, (click the image below to watch):\n\n\n\nHamel Husain’s tutorial: Debugging Axolotl w/VSCode",
+    "text": "Debugging with VSCode\n\nBackground\nThe below example shows how to configure VSCode to debug data preprocessing of the chat_template format. This is the format used when you have the following in your axolotl config:\ndatasets:\n  - path: &lt;path to your chat_template formatted dataset&gt; # example on HF Hub: fozziethebeat/alpaca_messages_2k_test\n    type: chat_template\n\n[!Important]\nIf you are already familiar with advanced VSCode debugging, you can skip the below explanation and look at the files .vscode/launch.json and .vscode/tasks.json for an example configuration.\n\n\n[!Tip]\nIf you prefer to watch a video, rather than read, you can skip to the video tutorial below (but doing both is recommended).\n\n\n\nSetup\nMake sure you have an editable install of Axolotl, which ensures that changes you make to the code are reflected at runtime. Run the following commands from the root of this project:\nexport UV_TORCH_BACKEND=cu128  # or cu130\nuv sync --extra flash-attn --extra deepspeed --group dev --group test\nsource .venv/bin/activate\n\nRemote Hosts\nIf you developing on a remote host, you can easily use VSCode to debug remotely. To do so, you will need to follow this remote - SSH guide. You can also see the video below on Docker and Remote SSH debugging.\n\n\n\nConfiguration\nThe easiest way to get started is to modify the .vscode/launch.json file in this project. This is just an example configuration, so you may need to modify or copy it to suit your needs.\nFor example, to mimic the command cd devtools && CUDA_VISIBLE_DEVICES=0 axolotl train dev_chat_template.yml, you would use the below configuration1. Note that we add additional flags that override the axolotl config and incorporate the tips above (see the comments). We also set the working directory to devtools and set the env variable HF_HOME to a temporary folder that is later partially deleted. 
This is because we want to delete the HF dataset cache before each run in order to ensure that the data preprocessing code is run from scratch.\n// .vscode/launch.json\n{\n    \"version\": \"0.2.0\",\n    \"configurations\": [\n        {\n            \"name\": \"Debug axolotl prompt - chat_template\",\n            \"type\": \"python\",\n            \"module\": \"accelerate.commands.launch\",\n            \"request\": \"launch\",\n            \"args\": [\n                \"-m\", \"axolotl.cli.train\", \"dev_chat_template.yml\",\n                // The flags below simplify debugging by overriding the axolotl config\n                // with the debugging tips above.  Modify as needed.\n                \"--dataset_num_proc=1\",      // limits data preprocessing to one process\n                \"--max_steps=1\",              // limits training to just one step\n                \"--batch_size=1\",             // minimizes batch size\n                \"--micro_batch_size=1\",       // minimizes batch size\n                \"--val_set_size=0\",           // disables validation\n                \"--sample_packing=False\",     // disables sample packing which is necessary for small datasets\n                \"--eval_sample_packing=False\",// disables sample packing on eval set\n                \"--dataset_prepared_path=temp_debug/axolotl_outputs/data\", // send data outputs to a temp folder\n                \"--output_dir=temp_debug/axolotl_outputs/model\" // send model outputs to a temp folder\n                ],\n            \"console\": \"integratedTerminal\",      // show output in the integrated terminal\n            \"cwd\": \"${workspaceFolder}/devtools\", // set working directory to devtools from the root of the project\n            \"justMyCode\": true,                   // step through only axolotl code\n            \"env\": {\"CUDA_VISIBLE_DEVICES\": \"0\",  // Since we aren't doing distributed training, we need to limit to one GPU\n                    
\"HF_HOME\": \"${workspaceFolder}/devtools/temp_debug/.hf-cache\"}, // send HF cache to a temp folder\n            \"preLaunchTask\": \"cleanup-for-dataprep\", // delete temp folders (see below)\n        }\n    ]\n}\nAdditional notes about this configuration:\n\nThe argument justMyCode is set to true such that you step through only the axolotl code. If you want to step into dependencies, set this to false.\nThe preLaunchTask: cleanup-for-dataprep is defined in .vscode/tasks.json and is used to delete the following folders before debugging, which is essential to ensure that the data pre-processing code is run from scratch:\n\n./devtools/temp_debug/axolotl_outputs\n./devtools/temp_debug/.hf-cache/datasets\n\n\n\n[!Tip]\nYou may not want to delete these folders. For example, if you are debugging model training instead of data pre-processing, you may NOT want to delete the cache or output folders. You may also need to add additional tasks to the tasks.json file depending on your use case.\n\nBelow is the ./vscode/tasks.json file that defines the cleanup-for-dataprep task. This task is run before each debugging session when you use the above configuration. Note how there are two tasks that delete the two folders mentioned above. The third task cleanup-for-dataprep is a composite task that combines the two tasks. 
A composite task is necessary because VSCode does not allow you to specify multiple tasks in the preLaunchTask argument of the launch.json file.\n// .vscode/tasks.json\n// this file is used by launch.json\n{\n    \"version\": \"2.0.0\",\n    \"tasks\": [\n      // this task changes into the devtools directory and deletes the temp_debug/axolotl_outputs folder\n      {\n        \"label\": \"delete-outputs\",\n        \"type\": \"shell\",\n        \"command\": \"rm -rf temp_debug/axolotl_outputs\",\n        \"options\":{ \"cwd\": \"${workspaceFolder}/devtools\"},\n        \"problemMatcher\": []\n      },\n      // this task changes into the devtools directory and deletes the `temp_debug/.hf-cache/datasets` folder\n      {\n        \"label\": \"delete-temp-hf-dataset-cache\",\n        \"type\": \"shell\",\n        \"command\": \"rm -rf temp_debug/.hf-cache/datasets\",\n        \"options\":{ \"cwd\": \"${workspaceFolder}/devtools\"},\n        \"problemMatcher\": []\n      },\n        // this task combines the two tasks above\n      {\n       \"label\": \"cleanup-for-dataprep\",\n       \"dependsOn\": [\"delete-outputs\", \"delete-temp-hf-dataset-cache\"],\n      }\n    ]\n}\n\n\nCustomizing your debugger\nYour debugging use case may differ from the example above. The easiest thing to do is to put your own axolotl config in the devtools folder and modify the launch.json file to use your config. You may also want to modify the preLaunchTask to delete different folders or not delete anything at all.\n\n\nVideo Tutorial\nThe following video tutorial walks through the above configuration and demonstrates how to debug with VSCode, (click the image below to watch):\n\n\n\nHamel Husain’s tutorial: Debugging Axolotl w/VSCode",
     "crumbs": [
       "Troubleshooting",
       "Debugging"
@@ -6085,7 +6078,7 @@
     "href": "docs/debugging.html#debugging-with-docker",
     "title": "Debugging",
     "section": "Debugging With Docker",
-    "text": "Debugging With Docker\nUsing official Axolotl Docker images is a great way to debug your code, and is a very popular way to use Axolotl. Attaching VSCode to Docker takes a few more steps.\n\nSetup\nOn the host that is running axolotl (ex: if you are using a remote host), clone the axolotl repo and change your current directory to the root:\ngit clone https://github.com/axolotl-ai-cloud/axolotl\ncd axolotl\n\n[!Tip]\nIf you already have axolotl cloned on your host, make sure you have the latest changes and change into the root of the project.\n\nNext, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:2\ndocker run --privileged --gpus '\"all\"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src=\"${PWD}\",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1\n\n[!Tip]\nTo understand which containers are available, see the Docker section of the README and the DockerHub repo. For details of how the Docker containers are built, see axolotl’s Docker CI builds.\n\nYou will now be in the container. Next, perform an editable install of Axolotl:\npip3 install packaging\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n\n\nAttach To Container\nNext, if you are using a remote host, Remote into this host with VSCode. If you are using a local host, you can skip this step.\nNext, select Dev Containers: Attach to Running Container... using the command palette (CMD + SHIFT + P) in VSCode. You will be prompted to select a container to attach to. Select the container you just created. You will now be in the container with a working directory that is at the root of the project. 
Any changes you make to the code will be reflected both in the container and on the host.\nNow you are ready to debug as described above (see Debugging with VSCode).\n\n\nVideo - Attaching To Docker On Remote Host\nHere is a short video that demonstrates how to attach to a Docker container on a remote host:\n\n\n\nHamel Husain’s tutorial: Debugging Axolotl Part 2: Attaching to Docker on a Remote Host",
+    "text": "Debugging With Docker\nUsing official Axolotl Docker images is a great way to debug your code, and is a very popular way to use Axolotl. Attaching VSCode to Docker takes a few more steps.\n\nSetup\nOn the host that is running axolotl (ex: if you are using a remote host), clone the axolotl repo and change your current directory to the root:\ngit clone https://github.com/axolotl-ai-cloud/axolotl\ncd axolotl\n\n[!Tip]\nIf you already have axolotl cloned on your host, make sure you have the latest changes and change into the root of the project.\n\nNext, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:2\ndocker run --privileged --gpus '\"all\"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src=\"${PWD}\",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl-uv:main-latest\n\n[!Tip]\nTo understand which containers are available, see the Docker section of the README and the DockerHub repo. For details of how the Docker containers are built, see axolotl’s Docker CI builds.\n\nYou will now be in the container. Next, install Axolotl with dev dependencies:\nuv sync --extra flash-attn --extra deepspeed --group dev --group test\nsource .venv/bin/activate\n\n\nAttach To Container\nNext, if you are using a remote host, Remote into this host with VSCode. If you are using a local host, you can skip this step.\nNext, select Dev Containers: Attach to Running Container... using the command palette (CMD + SHIFT + P) in VSCode. You will be prompted to select a container to attach to. Select the container you just created. You will now be in the container with a working directory that is at the root of the project. 
Any changes you make to the code will be reflected both in the container and on the host.\nNow you are ready to debug as described above (see Debugging with VSCode).\n\n\nVideo - Attaching To Docker On Remote Host\nHere is a short video that demonstrates how to attach to a Docker container on a remote host:\n\n\n\nHamel Husain’s tutorial: Debugging Axolotl Part 2: Attaching to Docker on a Remote Host",
     "crumbs": [
       "Troubleshooting",
       "Debugging"
@@ -6133,7 +6126,7 @@
     "href": "docs/models/magistral/vision.html#getting-started",
     "title": "Magistral Vision",
     "section": "Getting started",
-    "text": "Getting started\n\nInstall the required vision lib:\nbash  pip install 'mistral-common[opencv]==1.8.5'\nDownload the example dataset image:\nwget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg\nRun the fine-tuning:\naxolotl train examples/magistral/vision/magistral-small-vision-24B-qlora.yml\n\nThis config uses about 17GiB VRAM.\nWARNING: The loss and grad norm will be much higher than normal at first. We suspect this to be inherent to the model as of the moment. If anyone would like to submit a fix for this, we are happy to take a look.\n\nTips\nKey differences from text-only model:\n- max_tokens: 131072 for inference\n- Multi-modal dataset format required\n- Sample packing not supported",
+    "text": "Getting started\n\nInstall the required vision lib:\nbash  uv pip install 'mistral-common[opencv]==1.8.5'\nDownload the example dataset image:\nwget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg\nRun the fine-tuning:\naxolotl train examples/magistral/vision/magistral-small-vision-24B-qlora.yml\n\nThis config uses about 17GiB VRAM.\nWARNING: The loss and grad norm will be much higher than normal at first. We suspect this to be inherent to the model as of the moment. If anyone would like to submit a fix for this, we are happy to take a look.\n\nTips\nKey differences from text-only model:\n- max_tokens: 131072 for inference\n- Multi-modal dataset format required\n- Sample packing not supported",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -6232,7 +6225,7 @@
     "href": "docs/models/gpt-oss.html#getting-started",
     "title": "GPT-OSS",
     "section": "Getting started",
-    "text": "Getting started\n\nInstall Axolotl following the installation guide.\nHere is an example of how to install from pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.6.0 min)\npip3 install packaging==26.0 setuptools==75.8.0 wheel ninja\npip3 install --no-build-isolation 'axolotl[flash-attn]&gt;=0.12.0'\n\nChoose one of the following configs below for training the 20B model. (for 120B, see below)\n\n# LoRA SFT linear layers (1x48GB @ ~44GiB)\naxolotl train examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml\n\n# FFT SFT with offloading (2x24GB @ ~21GiB/GPU)\naxolotl train examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml\n\n# FFT SFT (8x48GB @ ~36GiB/GPU or 4x80GB @ ~46GiB/GPU)\naxolotl train examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml\nNote: Memory usage taken from device_mem_reserved(gib) from logs.\n\nTraining 120B\nOn 8xH100s, make sure you have ~3TB of free disk space. With each checkpoint clocking in at ~720GB, along with the base\nmodel, and final model output, you may need at least 3TB of free disk space to keep at least 2 checkpoints.\n# FFT SFT with offloading (8x80GB @ ~49GiB/GPU)\naxolotl train examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml\nTo simplify fine-tuning across 2 nodes × 8x H100 (80GB) GPUs, we’ve partnered with Baseten to showcase multi-node\ntraining of the 120B model using Baseten Truss. You can read more about this recipe on\nBaseten’s blog. The recipe can\nbe found on their\nGitHub.\nERRATA: Transformers saves the model Architecture prefixed with FSDP which needs to be manually renamed in config.json.\nSee https://github.com/huggingface/transformers/pull/40207 for the status of this issue.\nsed -i 's/FSDPGptOssForCausalLM/GptOssForCausalLM/g' ./outputs/gpt-oss-out/config.json\nWhen using SHARDED_STATE_DICT with FSDP, the final checkpoint should automatically merge the sharded weights to your\nconfigured output_dir. 
However, if that step fails due to a disk space error, you can take an additional step to\nmerge the sharded weights. This step will automatically determine the last checkpoint directory and merge the sharded\nweights to {output_dir}/merged.\naxolotl merge-sharded-fsdp-weights examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml\nmv ./outputs/gpt-oss-out/merged/* ./outputs/gpt-oss-out/\n\n\nHow to set reasoning_effort in template?\nThe harmony template has a feature to set the reasoning_effort during prompt building. The default is medium. If you would like to adjust this, you can add the following to your config:\nchat_template_kwargs:\n  reasoning_effort: \"high\"  # low | medium | high\nCurrently, this applies globally. There is no method to apply per sample yet. If you are interested in adding this, please feel free to create an Issue to discuss.\n\n\nInferencing your fine-tuned model\n\nvLLM\nGPT-OSS support in vLLM does not exist in a stable release yet. See https://x.com/MaziyarPanahi/status/1955741905515323425\nfor more information about using a special vllm-openai docker image for inferencing with vLLM.\nOptionally, vLLM can be installed from nightly:\npip install --no-build-isolation --pre -U vllm --extra-index-url https://wheels.vllm.ai/nightly\nand the vLLM server can be started with the following command (modify --tensor-parallel-size 8 to match your environment):\nvllm serve ./outputs/gpt-oss-out/ --served-model-name axolotl/gpt-oss-20b --host 0.0.0.0 --port 8888  --tensor-parallel-size 8\n\n\nSGLang\nSGLang has 0-day support in main, see https://github.com/sgl-project/sglang/issues/8833 for infomation on installing\nSGLang from source. Once you’ve installed SGLang, run the following command to launch a SGLang server:\npython3 -m sglang.launch_server --model ./outputs/gpt-oss-out/ --served-model-name axolotl/gpt-oss-120b --host 0.0.0.0 --port 8888 --tp 8\n\n\n\nTool use\nGPT-OSS has a comprehensive tool understanding. 
Axolotl supports tool calling datasets for Supervised Fine-tuning.\nHere is an example dataset config:\ndatasets:\n  - path: Nanobit/text-tools-2k-test\n    type: chat_template\nSee Nanobit/text-tools-2k-test for the sample dataset.\nRefer to our docs for more info.\n\n\nThinking and chat_template masking conflict\nOpenAI’s Harmony template hides thinking in all non-final turns, which conflicts with Axolotl’s chat_template masking.\nIf your dataset has thinking content mid-turn, there are two paths we recommend:\n\nTrain only on the last turn. This can be accomplished via chat_template’s train on last doc.\nAdjust your dataset to only have thinking content in the last turn.\n\n\n\nTIPS\n\nRead more on how to load your own dataset at docs.\nThe dataset format follows the OpenAI Messages format as seen here.",
+    "text": "Getting started\n\nInstall Axolotl following the installation guide.\nHere is an example of how to install from pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.6.0 min)\nuv pip install --no-build-isolation 'axolotl[flash-attn]&gt;=0.12.0'\n\nChoose one of the following configs below for training the 20B model. (for 120B, see below)\n\n# LoRA SFT linear layers (1x48GB @ ~44GiB)\naxolotl train examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml\n\n# FFT SFT with offloading (2x24GB @ ~21GiB/GPU)\naxolotl train examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml\n\n# FFT SFT (8x48GB @ ~36GiB/GPU or 4x80GB @ ~46GiB/GPU)\naxolotl train examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml\nNote: Memory usage taken from device_mem_reserved(gib) from logs.\n\nTraining 120B\nOn 8xH100s, make sure you have ~3TB of free disk space. With each checkpoint clocking in at ~720GB, along with the base\nmodel, and final model output, you may need at least 3TB of free disk space to keep at least 2 checkpoints.\n# FFT SFT with offloading (8x80GB @ ~49GiB/GPU)\naxolotl train examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml\nTo simplify fine-tuning across 2 nodes × 8x H100 (80GB) GPUs, we’ve partnered with Baseten to showcase multi-node\ntraining of the 120B model using Baseten Truss. You can read more about this recipe on\nBaseten’s blog. The recipe can\nbe found on their\nGitHub.\nERRATA: Transformers saves the model Architecture prefixed with FSDP which needs to be manually renamed in config.json.\nSee https://github.com/huggingface/transformers/pull/40207 for the status of this issue.\nsed -i 's/FSDPGptOssForCausalLM/GptOssForCausalLM/g' ./outputs/gpt-oss-out/config.json\nWhen using SHARDED_STATE_DICT with FSDP, the final checkpoint should automatically merge the sharded weights to your\nconfigured output_dir. However, if that step fails due to a disk space error, you can take an additional step to\nmerge the sharded weights. 
This step will automatically determine the last checkpoint directory and merge the sharded\nweights to {output_dir}/merged.\naxolotl merge-sharded-fsdp-weights examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml\nmv ./outputs/gpt-oss-out/merged/* ./outputs/gpt-oss-out/\n\n\nHow to set reasoning_effort in template?\nThe harmony template has a feature to set the reasoning_effort during prompt building. The default is medium. If you would like to adjust this, you can add the following to your config:\nchat_template_kwargs:\n  reasoning_effort: \"high\"  # low | medium | high\nCurrently, this applies globally. There is no method to apply per sample yet. If you are interested in adding this, please feel free to create an Issue to discuss.\n\n\nInferencing your fine-tuned model\n\nvLLM\nGPT-OSS support in vLLM does not exist in a stable release yet. See https://x.com/MaziyarPanahi/status/1955741905515323425\nfor more information about using a special vllm-openai docker image for inferencing with vLLM.\nOptionally, vLLM can be installed from nightly:\nuv pip install --no-build-isolation --pre -U vllm --extra-index-url https://wheels.vllm.ai/nightly\nand the vLLM server can be started with the following command (modify --tensor-parallel-size 8 to match your environment):\nvllm serve ./outputs/gpt-oss-out/ --served-model-name axolotl/gpt-oss-20b --host 0.0.0.0 --port 8888  --tensor-parallel-size 8\n\n\nSGLang\nSGLang has 0-day support in main, see https://github.com/sgl-project/sglang/issues/8833 for infomation on installing\nSGLang from source. Once you’ve installed SGLang, run the following command to launch a SGLang server:\npython3 -m sglang.launch_server --model ./outputs/gpt-oss-out/ --served-model-name axolotl/gpt-oss-120b --host 0.0.0.0 --port 8888 --tp 8\n\n\n\nTool use\nGPT-OSS has a comprehensive tool understanding. 
Axolotl supports tool calling datasets for Supervised Fine-tuning.\nHere is an example dataset config:\ndatasets:\n  - path: Nanobit/text-tools-2k-test\n    type: chat_template\nSee Nanobit/text-tools-2k-test for the sample dataset.\nRefer to our docs for more info.\n\n\nThinking and chat_template masking conflict\nOpenAI’s Harmony template hides thinking in all non-final turns, which conflicts with Axolotl’s chat_template masking.\nIf your dataset has thinking content mid-turn, there are two paths we recommend:\n\nTrain only on the last turn. This can be accomplished via chat_template’s train on last doc.\nAdjust your dataset to only have thinking content in the last turn.\n\n\n\nTIPS\n\nRead more on how to load your own dataset at docs.\nThe dataset format follows the OpenAI Messages format as seen here.",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -6280,7 +6273,7 @@
     "href": "docs/models/LiquidAI.html#getting-started",
     "title": "Liquid Foundation Models 2",
     "section": "Getting Started",
-    "text": "Getting Started\n\nInstall Axolotl following the installation guide.\nHere is an example of how to install from pip:\n# Ensure you have a compatible version of Pytorch installed\npip3 install packaging setuptools wheel ninja\npip3 install --no-build-isolation 'axolotl[flash-attn]&gt;=0.12.0'\nRun one of the finetuning examples below.\nLFM2\n# FFT SFT (1x48GB @ 25GiB)\naxolotl train examples/LiquidAI/lfm2-350m-fft.yaml\nLFM2-VL\n# LoRA SFT (1x48GB @ 2.7GiB)\naxolotl train examples/LiquidAI/lfm2-vl-lora.yaml\nLFM2-MoE\npip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6\n\n# LoRA SFT (1x48GB @ 16.2GiB)\naxolotl train examples/LiquidAI/lfm2-8b-a1b-lora.yaml\n\n\nTIPS\n\nInstallation Error: If you encounter ImportError: ... undefined symbol ... or ModuleNotFoundError: No module named 'causal_conv1d_cuda', the causal-conv1d package may have been installed incorrectly. Try uninstalling it:\npip uninstall -y causal-conv1d\nDataset Loading: Read more on how to load your own dataset in our documentation.\nDataset Formats:\n\nFor LFM2 models, the dataset format follows the OpenAI Messages format as seen here.\nFor LFM2-VL models, Axolotl follows the multi-content Messages format. See our Multimodal docs for details.",
+    "text": "Getting Started\n\nInstall Axolotl following the installation guide.\nHere is an example of how to install from pip:\n# Ensure you have a compatible version of Pytorch installed\nuv pip install --no-build-isolation 'axolotl[flash-attn]&gt;=0.12.0'\nRun one of the finetuning examples below.\nLFM2\n# FFT SFT (1x48GB @ 25GiB)\naxolotl train examples/LiquidAI/lfm2-350m-fft.yaml\nLFM2-VL\n# LoRA SFT (1x48GB @ 2.7GiB)\naxolotl train examples/LiquidAI/lfm2-vl-lora.yaml\nLFM2-MoE\nuv pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6\n\n# LoRA SFT (1x48GB @ 16.2GiB)\naxolotl train examples/LiquidAI/lfm2-8b-a1b-lora.yaml\n\n\nTIPS\n\nInstallation Error: If you encounter ImportError: ... undefined symbol ... or ModuleNotFoundError: No module named 'causal_conv1d_cuda', the causal-conv1d package may have been installed incorrectly. Try uninstalling it:\nuv pip uninstall causal-conv1d\nDataset Loading: Read more on how to load your own dataset in our documentation.\nDataset Formats:\n\nFor LFM2 models, the dataset format follows the OpenAI Messages format as seen here.\nFor LFM2-VL models, Axolotl follows the multi-content Messages format. See our Multimodal docs for details.",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -6328,7 +6321,7 @@
     "href": "docs/models/granite4.html#getting-started",
     "title": "Granite 4",
     "section": "Getting started",
-    "text": "Getting started\n\nInstall Axolotl following the installation guide. You need to install from main as Granite4 is only on nightly or use our latest Docker images.\nHere is an example of how to install from main for pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.7.1 min)\ngit clone https://github.com/axolotl-ai-cloud/axolotl.git\ncd axolotl\n\npip3 install packaging==26.0 setuptools==75.8.0 wheel ninja\npip3 install --no-build-isolation -e '.[flash-attn]'\n\n# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy\npython scripts/cutcrossentropy_install.py | sh\n\nRun the finetuning example:\n\naxolotl train examples/granite4/granite-4.0-tiny-fft.yaml\nThis config uses about 40.8GiB VRAM.\nLet us know how it goes. Happy finetuning! 🚀\n\nTIPS\n\nRead more on how to load your own dataset at docs.\nThe dataset format follows the OpenAI Messages format as seen here.\n\n\n\nLimitation\nAdapter finetuning does not work at the moment. It would error with\nRuntimeError: mat1 and mat2 shapes cannot be multiplied (4096x3072 and 1x1179648)\nIn addition, if adapter training works, lora_target_linear: true will not work due to:\nValueError: Target module GraniteMoeHybridParallelExperts() is not supported.",
+    "text": "Getting started\n\nInstall Axolotl following the installation guide. You need to install from main as Granite4 is only on nightly or use our latest Docker images.\nHere is an example of how to install from main for pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.7.1 min)\ngit clone https://github.com/axolotl-ai-cloud/axolotl.git\ncd axolotl\n\nuv pip install --no-build-isolation -e '.[flash-attn]'\n\n# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy\npython scripts/cutcrossentropy_install.py | sh\n\nRun the finetuning example:\n\naxolotl train examples/granite4/granite-4.0-tiny-fft.yaml\nThis config uses about 40.8GiB VRAM.\nLet us know how it goes. Happy finetuning! 🚀\n\nTIPS\n\nRead more on how to load your own dataset at docs.\nThe dataset format follows the OpenAI Messages format as seen here.\n\n\n\nLimitation\nAdapter finetuning does not work at the moment. It would error with\nRuntimeError: mat1 and mat2 shapes cannot be multiplied (4096x3072 and 1x1179648)\nIn addition, if adapter training works, lora_target_linear: true will not work due to:\nValueError: Target module GraniteMoeHybridParallelExperts() is not supported.",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -6376,7 +6369,7 @@
     "href": "docs/models/voxtral.html#getting-started",
     "title": "Voxtral",
     "section": "Getting started",
-    "text": "Getting started\n\nInstall Axolotl following the installation guide.\nHere is an example of how to install from pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.6.0 min)\npip3 install packaging==26.0 setuptools==75.8.0 wheel ninja\npip3 install --no-build-isolation 'axolotl[flash-attn]&gt;=0.12.0'\n\nPlease install the below.\n\n# audio\npip3 install librosa==0.11.0\npip3 install 'mistral_common[audio]==1.8.3'\n\n# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy\npython scripts/cutcrossentropy_install.py | sh\n\nDownload sample dataset files\n\n# for text + audio only\nwget https://huggingface.co/datasets/Nanobit/text-audio-2k-test/resolve/main/En-us-African_elephant.oga\n\nRun the finetuning example:\n\n# text only\naxolotl train examples/voxtral/voxtral-mini-qlora.yml\n\n# text + audio\naxolotl train examples/voxtral/voxtral-mini-audio-qlora.yml\nThese configs use about 4.8 GB VRAM.\nLet us know how it goes. Happy finetuning! 🚀\n\nTIPS\n\nFor inference, the official MistralAI team recommends temperature: 0.2 and top_p: 0.95 for audio understanding and temperature: 0.0 for transcription.\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe text dataset format follows the OpenAI Messages format as seen here.\nThe multimodal dataset format follows the OpenAI multi-content Messages format as seen here.",
+    "text": "Getting started\n\nInstall Axolotl following the installation guide.\nHere is an example of how to install from pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.6.0 min)\nuv pip install --no-build-isolation 'axolotl[flash-attn]&gt;=0.12.0'\n\nPlease install the below.\n\n# audio\nuv pip install librosa==0.11.0\nuv pip install 'mistral_common[audio]==1.8.3'\n\n# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy\npython scripts/cutcrossentropy_install.py | sh\n\nDownload sample dataset files\n\n# for text + audio only\nwget https://huggingface.co/datasets/Nanobit/text-audio-2k-test/resolve/main/En-us-African_elephant.oga\n\nRun the finetuning example:\n\n# text only\naxolotl train examples/voxtral/voxtral-mini-qlora.yml\n\n# text + audio\naxolotl train examples/voxtral/voxtral-mini-audio-qlora.yml\nThese configs use about 4.8 GB VRAM.\nLet us know how it goes. Happy finetuning! 🚀\n\nTIPS\n\nFor inference, the official MistralAI team recommends temperature: 0.2 and top_p: 0.95 for audio understanding and temperature: 0.0 for transcription.\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe text dataset format follows the OpenAI Messages format as seen here.\nThe multimodal dataset format follows the OpenAI multi-content Messages format as seen here.",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -6460,7 +6453,7 @@
     "href": "docs/models/mistral-small.html#getting-started",
     "title": "Mistral Small 3.1/3.2",
     "section": "Getting Started",
-    "text": "Getting Started\n\nInstall the required vision lib:\nbash  pip install 'mistral-common[opencv]==1.8.5'\nDownload the example dataset image:\nwget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg\nRun the fine-tuning:\naxolotl train examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml\n\nThis config uses about 29.4 GiB VRAM.",
+    "text": "Getting Started\n\nInstall the required vision lib:\nbash  uv pip install 'mistral-common[opencv]==1.8.5'\nDownload the example dataset image:\nwget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg\nRun the fine-tuning:\naxolotl train examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml\n\nThis config uses about 29.4 GiB VRAM.",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -6748,7 +6741,7 @@
     "href": "docs/models/magistral.html#getting-started",
     "title": "Magistral",
     "section": "Getting started",
-    "text": "Getting started\n\nInstall Axolotl following the installation guide.\nHere is an example of how to install from pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.7.0 min)\npip3 install packaging==26.0 setuptools==75.8.0 wheel ninja\npip3 install --no-build-isolation 'axolotl[flash-attn]&gt;=0.12.0'\n\nInstall Cut Cross Entropy to reduce training VRAM usage\n\npython scripts/cutcrossentropy_install.py | sh\n\nRun the finetuning example:\n\naxolotl train examples/magistral/magistral-small-qlora.yaml\nThis config uses about 24GB VRAM.\nLet us know how it goes. Happy finetuning! 🚀\n\nThinking\nMistralAI has released their 2507 model with thinking capabilities, enabling Chain-of-Thought reasoning with explicit thinking steps.\n📚 See the Thinking fine-tuning guide →\n\n\nVision\nMistralAI has released their 2509 model with vision capabilities.\n📚 See the Vision fine-tuning guide →\n\n\nTips\n\nWe recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo’s files titled SYSTEM_PROMPT.txt.\nFor inference, the official MistralAI team recommends top_p: 0.95 and temperature: 0.7 with max_tokens: 40960.\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe text dataset format follows the OpenAI Messages format as seen here.",
+    "text": "Getting started\n\nInstall Axolotl following the installation guide.\nHere is an example of how to install from pip:\n\n# Ensure you have Pytorch installed (Pytorch 2.7.0 min)\nuv pip install --no-build-isolation 'axolotl[flash-attn]&gt;=0.12.0'\n\nInstall Cut Cross Entropy to reduce training VRAM usage\n\npython scripts/cutcrossentropy_install.py | sh\n\nRun the finetuning example:\n\naxolotl train examples/magistral/magistral-small-qlora.yaml\nThis config uses about 24GB VRAM.\nLet us know how it goes. Happy finetuning! 🚀\n\nThinking\nMistralAI has released their 2507 model with thinking capabilities, enabling Chain-of-Thought reasoning with explicit thinking steps.\n📚 See the Thinking fine-tuning guide →\n\n\nVision\nMistralAI has released their 2509 model with vision capabilities.\n📚 See the Vision fine-tuning guide →\n\n\nTips\n\nWe recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo’s files titled SYSTEM_PROMPT.txt.\nFor inference, the official MistralAI team recommends top_p: 0.95 and temperature: 0.7 with max_tokens: 40960.\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe text dataset format follows the OpenAI Messages format as seen here.",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -6826,7 +6819,7 @@
     "href": "docs/models/ministral3.html#getting-started",
     "title": "Ministral3",
     "section": "Getting started",
-    "text": "Getting started\n\nInstall Axolotl from source following the installation guide.\nInstall Cut Cross Entropy to reduce training VRAM usage.\nSwap to the Axolotl transformers v5 branch\ncp examples/ministral3/ministral3-3b-qlora.yaml ministral3-3b-qlora.yaml\n\ngit fetch\ngit checkout transformers-v5\n\n# Install packages for transformers v5\npip install -e .\nRun the fine-tuning:\naxolotl train ministral3-3b-qlora.yaml\n\nLet us know how it goes. Happy finetuning! 🚀\n\nTips\n\nWe recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo’s files titled SYSTEM_PROMPT.txt.\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe text dataset format follows the OpenAI Messages format as seen here.\n\n\n\nThinking\nMinistral3 2512 model supports thinking capabilities, enabling Chain-of-Thought reasoning with explicit thinking steps.\n📚 See the Thinking fine-tuning guide →\n\n\nVision\nMinistral3 2512 model also supports vision capabilities.\n📚 See the Vision fine-tuning guide →",
+    "text": "Getting started\n\nInstall Axolotl from source following the installation guide.\nInstall Cut Cross Entropy to reduce training VRAM usage.\nSwap to the Axolotl transformers v5 branch\ncp examples/ministral3/ministral3-3b-qlora.yaml ministral3-3b-qlora.yaml\n\ngit fetch\ngit checkout transformers-v5\n\n# Install packages for transformers v5\nuv pip install -e .\nRun the fine-tuning:\naxolotl train ministral3-3b-qlora.yaml\n\nLet us know how it goes. Happy finetuning! 🚀\n\nTips\n\nWe recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo’s files titled SYSTEM_PROMPT.txt.\nYou can run a full finetuning by removing the adapter: qlora and load_in_4bit: true from the config.\nRead more on how to load your own dataset at docs.\nThe text dataset format follows the OpenAI Messages format as seen here.\n\n\n\nThinking\nMinistral3 2512 model supports thinking capabilities, enabling Chain-of-Thought reasoning with explicit thinking steps.\n📚 See the Thinking fine-tuning guide →\n\n\nVision\nMinistral3 2512 model also supports vision capabilities.\n📚 See the Vision fine-tuning guide →",
     "crumbs": [
       "Getting Started",
       "Model Guides",
@@ -6969,7 +6962,7 @@
     "href": "index.html#quick-start---llm-fine-tuning-in-minutes",
     "title": "Axolotl",
     "section": "🚀 Quick Start - LLM Fine-tuning in Minutes",
-    "text": "🚀 Quick Start - LLM Fine-tuning in Minutes\nRequirements:\n\nNVIDIA GPU (Ampere or newer for bf16 and Flash Attention) or AMD GPU\nPython &gt;=3.11 (3.12 recommended)\nPyTorch ≥2.9.1\n\n\nGoogle Colab\n\n\n\nOpen In Colab\n\n\n\n\nInstallation\n\nUsing uv (recommended)\n# install uv if you don't already have it installed\ncurl -LsSf https://astral.sh/uv/install.sh | sh\nsource $HOME/.local/bin/env\n\n# CUDA 12.8.1 tends to have better package compatibility\nexport UV_TORCH_BACKEND=cu128\n\n# create a new virtual environment\nuv venv --python 3.12\nsource .venv/bin/activate\n\nuv pip install torch==2.10.0 torchvision\nuv pip install --no-build-isolation axolotl[deepspeed]\n\n# recommended - install cut-cross-entropy\nuv pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@main\"\n\n# (optional) - prefetch flash-attn2 and causal-conv1d kernels\nuv run --python 3.12 python -c \"from kernels import get_kernel; get_kernel('kernels-community/flash-attn2'); get_kernel('kernels-community/causal-conv1d')\"\n\n# Download example axolotl configs, deepspeed configs\naxolotl fetch examples\naxolotl fetch deepspeed_configs  # OPTIONAL\n\n\nUsing pip\npip3 install -U packaging==26.0 setuptools==75.8.0 wheel ninja\npip3 install --no-build-isolation axolotl[flash-attn,deepspeed]\n\n# Download example axolotl configs, deepspeed configs\naxolotl fetch examples\naxolotl fetch deepspeed_configs  # OPTIONAL\n\n\nUsing Docker\nInstalling with Docker can be less error prone than installing in your own environment.\ndocker run --gpus '\"all\"' --rm -it axolotlai/axolotl:main-latest\nOther installation approaches are described here.\n\n\nCloud Providers\n\n\nRunPod\nVast.ai\nPRIME Intellect\nModal\nNovita\nJarvisLabs.ai\nLatitude.sh\n\n\n\n\n\nYour First Fine-tune\n# Fetch axolotl examples\naxolotl fetch examples\n\n# Or, specify a custom path\naxolotl fetch examples --dest path/to/folder\n\n# Train a model using 
LoRA\naxolotl train examples/llama-3/lora-1b.yml\nThat’s it! Check out our Getting Started Guide for a more detailed walkthrough.",
+    "text": "🚀 Quick Start - LLM Fine-tuning in Minutes\nRequirements:\n\nNVIDIA GPU (Ampere or newer for bf16 and Flash Attention) or AMD GPU\nPython &gt;=3.11 (3.12 recommended)\nPyTorch ≥2.9.1\n\n\nGoogle Colab\n\n\n\nOpen In Colab\n\n\n\n\nInstallation\n# install uv if you don't already have it installed (restart shell after)\ncurl -LsSf https://astral.sh/uv/install.sh | sh\n\n# change depending on system\nexport UV_TORCH_BACKEND=cu128\n\n# create a new virtual environment\nuv venv --python 3.12\nsource .venv/bin/activate\n\nuv pip install torch==2.10.0 torchvision\nuv pip install --no-build-isolation axolotl[deepspeed]\n\n# Download example axolotl configs, deepspeed configs\naxolotl fetch examples\naxolotl fetch deepspeed_configs  # OPTIONAL\n\nUsing Docker\nInstalling with Docker can be less error prone than installing in your own environment.\ndocker run --gpus '\"all\"' --ipc=host --rm -it axolotlai/axolotl:main-latest\nOther installation approaches are described here.\n\n\nCloud Providers\n\n\nRunPod\nVast.ai\nPRIME Intellect\nModal\nNovita\nJarvisLabs.ai\nLatitude.sh\n\n\n\n\n\nYour First Fine-tune\n# Fetch axolotl examples\naxolotl fetch examples\n\n# Or, specify a custom path\naxolotl fetch examples --dest path/to/folder\n\n# Train a model using LoRA\naxolotl train examples/llama-3/lora-1b.yml\nThat’s it! Check out our Getting Started Guide for a more detailed walkthrough.",
     "crumbs": [
       "Home"
     ]
diff --git a/sitemap.xml b/sitemap.xml
index 592b79ca4..4fa7a8430 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -2,990 +2,982 @@
 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
   <url>
     <loc>https://docs.axolotl.ai/FAQS.html</loc>
-    <lastmod>2026-04-15T13:27:31.200Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.738Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/qat.html</loc>
-    <lastmod>2026-04-15T13:27:31.206Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/seed-oss.html</loc>
-    <lastmod>2026-04-15T13:31:18.100Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.806Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/internvl3_5.html</loc>
-    <lastmod>2026-04-15T13:31:18.092Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.799Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/apertus.html</loc>
-    <lastmod>2026-04-15T13:31:18.099Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.805Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/mistral.html</loc>
-    <lastmod>2026-04-15T13:31:18.097Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.804Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/smolvlm2.html</loc>
-    <lastmod>2026-04-15T13:31:18.101Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.806Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/arcee.html</loc>
-    <lastmod>2026-04-15T13:31:18.093Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.800Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/ministral3/vision.html</loc>
-    <lastmod>2026-04-15T13:31:18.094Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.801Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/kimi-linear.html</loc>
-    <lastmod>2026-04-15T13:31:18.091Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.798Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/mimo.html</loc>
-    <lastmod>2026-04-15T13:31:18.091Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.799Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/phi.html</loc>
-    <lastmod>2026-04-15T13:31:18.100Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.806Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/qwen3.html</loc>
-    <lastmod>2026-04-15T13:31:18.099Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.805Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/hunyuan.html</loc>
-    <lastmod>2026-04-15T13:31:18.102Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.807Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/ministral.html</loc>
-    <lastmod>2026-04-15T13:31:18.096Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.803Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/gemma3n.html</loc>
-    <lastmod>2026-04-15T13:31:18.099Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.805Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/devstral.html</loc>
-    <lastmod>2026-04-15T13:31:18.097Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.803Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/magistral/think.html</loc>
-    <lastmod>2026-04-15T13:31:18.095Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.802Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/qwen3-next.html</loc>
-    <lastmod>2026-04-15T13:31:18.098Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.804Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/training_stability.html</loc>
-    <lastmod>2026-04-15T13:27:31.208Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.746Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/expert_quantization.html</loc>
-    <lastmod>2026-04-15T13:27:31.203Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.741Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/rlhf.html</loc>
-    <lastmod>2026-04-15T13:27:31.207Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.745Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/dataset-formats/conversation.html</loc>
-    <lastmod>2026-04-15T13:27:31.202Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.740Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/dataset-formats/template_free.html</loc>
-    <lastmod>2026-04-15T13:27:31.203Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.741Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/dataset-formats/tokenized.html</loc>
-    <lastmod>2026-04-15T13:27:31.203Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.741Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/gradient_checkpointing.html</loc>
-    <lastmod>2026-04-15T13:27:31.203Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.741Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/checkpoint_saving.html</loc>
-    <lastmod>2026-04-15T13:27:31.202Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.740Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.kto.llama3.html</loc>
-    <lastmod>2026-04-15T13:30:54.819Z</lastmod>
+    <lastmod>2026-04-21T14:20:09.024Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.chat.format.chatml.html</loc>
-    <lastmod>2026-04-15T13:30:54.153Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.374Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.quantize.html</loc>
-    <lastmod>2026-04-15T13:30:54.360Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.576Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.lora.html</loc>
-    <lastmod>2026-04-15T13:30:55.104Z</lastmod>
+    <lastmod>2026-04-21T14:20:09.302Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/loaders.model.html</loc>
-    <lastmod>2026-04-15T13:30:54.528Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.741Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.merge_lora.html</loc>
-    <lastmod>2026-04-15T13:30:54.331Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.547Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.trainers.dpo.trainer.html</loc>
-    <lastmod>2026-04-15T13:30:54.482Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.696Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.transformers_fa_utils.html</loc>
-    <lastmod>2026-04-15T13:30:55.036Z</lastmod>
+    <lastmod>2026-04-21T14:20:09.236Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.completion.html</loc>
-    <lastmod>2026-04-15T13:30:54.729Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.937Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/convert.html</loc>
-    <lastmod>2026-04-15T13:30:54.023Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.249Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.schemas.datasets.html</loc>
-    <lastmod>2026-04-15T13:30:55.323Z</lastmod>
+    <lastmod>2026-04-21T14:20:09.514Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.chat.format.llama3x.html</loc>
-    <lastmod>2026-04-15T13:30:54.154Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.376Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.messages.chat.html</loc>
-    <lastmod>2026-04-15T13:30:54.769Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.976Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.trainers.grpo.sampler.html</loc>
-    <lastmod>2026-04-15T13:30:54.514Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.727Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/kernels.quantize.html</loc>
-    <lastmod>2026-04-15T13:30:54.951Z</lastmod>
+    <lastmod>2026-04-21T14:20:09.154Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/integrations.liger.args.html</loc>
-    <lastmod>2026-04-15T13:30:55.580Z</lastmod>
+    <lastmod>2026-04-21T14:20:09.766Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.preprocess.html</loc>
-    <lastmod>2026-04-15T13:30:54.354Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.571Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.comet_.html</loc>
-    <lastmod>2026-04-15T13:30:55.720Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.profiler.html</loc>
-    <lastmod>2026-04-15T13:30:55.709Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.utils.html</loc>
-    <lastmod>2026-04-15T13:30:55.015Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.orpo.chat_template.html</loc>
-    <lastmod>2026-04-15T13:30:54.856Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.chat_templates.html</loc>
-    <lastmod>2026-04-15T13:30:55.098Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chatml.html</loc>
-    <lastmod>2026-04-15T13:30:54.802Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_disk.html</loc>
-    <lastmod>2026-04-15T13:30:55.087Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.checks.html</loc>
-    <lastmod>2026-04-15T13:30:54.276Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.utils.html</loc>
-    <lastmod>2026-04-15T13:30:55.382Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.lora_kernels.html</loc>
-    <lastmod>2026-04-15T13:30:55.008Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.model.html</loc>
-    <lastmod>2026-04-15T13:30:55.290Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.pygmalion.html</loc>
-    <lastmod>2026-04-15T13:30:54.764Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.bradley_terry.llama3.html</loc>
-    <lastmod>2026-04-15T13:30:54.861Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.optimizers.adopt.html</loc>
-    <lastmod>2026-04-15T13:30:55.228Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.bench.html</loc>
-    <lastmod>2026-04-15T13:30:55.115Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.qat.html</loc>
-    <lastmod>2026-04-15T13:30:55.728Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.base.html</loc>
-    <lastmod>2026-04-15T13:30:54.449Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.zephyr.html</loc>
-    <lastmod>2026-04-15T13:30:54.805Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.chat.messages.html</loc>
-    <lastmod>2026-04-15T13:30:54.151Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/integrations.lm_eval.args.html</loc>
-    <lastmod>2026-04-15T13:30:55.584Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.mamba.html</loc>
-    <lastmod>2026-04-15T13:30:54.474Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/train.html</loc>
-    <lastmod>2026-04-15T13:30:53.987Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.orcamini.html</loc>
-    <lastmod>2026-04-15T13:30:54.755Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.inference.html</loc>
-    <lastmod>2026-04-15T13:30:54.320Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.input_output.html</loc>
-    <lastmod>2026-04-15T13:30:54.737Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.data.streaming.html</loc>
-    <lastmod>2026-04-15T13:30:55.230Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.cloud.modal_.html</loc>
-    <lastmod>2026-04-15T13:30:54.381Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.collators.mm_chat.html</loc>
-    <lastmod>2026-04-15T13:30:55.647Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.config.html</loc>
-    <lastmod>2026-04-15T13:30:54.297Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.multipack.html</loc>
-    <lastmod>2026-04-15T13:30:54.966Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/integrations.grokfast.optimizer.html</loc>
-    <lastmod>2026-04-15T13:30:55.567Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.trl.html</loc>
-    <lastmod>2026-04-15T13:30:54.467Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_chat.html</loc>
-    <lastmod>2026-04-15T13:30:54.679Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.builders.rl.html</loc>
-    <lastmod>2026-04-15T13:30:54.106Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.trl.html</loc>
-    <lastmod>2026-04-15T13:30:55.338Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.collators.batching.html</loc>
-    <lastmod>2026-04-15T13:30:55.636Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.model_shard_quant.html</loc>
-    <lastmod>2026-04-15T13:30:55.111Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/integrations.kd.trainer.html</loc>
-    <lastmod>2026-04-15T13:30:55.576Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.peft.html</loc>
-    <lastmod>2026-04-15T13:30:55.334Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.ctx_managers.sequence_parallel.html</loc>
-    <lastmod>2026-04-15T13:30:54.618Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.vllm_serve.html</loc>
-    <lastmod>2026-04-15T13:30:54.369Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.quantization.html</loc>
-    <lastmod>2026-04-15T13:30:55.262Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.collators.mamba.html</loc>
-    <lastmod>2026-04-15T13:30:55.641Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/kernels.geglu.html</loc>
-    <lastmod>2026-04-15T13:30:54.925Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.utils.html</loc>
-    <lastmod>2026-04-15T13:30:54.515Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_flash.html</loc>
-    <lastmod>2026-04-15T13:30:54.960Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/common.architectures.html</loc>
-    <lastmod>2026-04-15T13:30:55.590Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.merge_sharded_fsdp_weights.html</loc>
-    <lastmod>2026-04-15T13:30:54.344Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.llama2_chat.html</loc>
-    <lastmod>2026-04-15T13:30:54.722Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/loaders.adapter.html</loc>
-    <lastmod>2026-04-15T13:30:54.547Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.utils.fetch.html</loc>
-    <lastmod>2026-04-15T13:30:54.403Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.stablelm_attn_hijack_flash.html</loc>
-    <lastmod>2026-04-15T13:30:55.024Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.main.html</loc>
-    <lastmod>2026-04-15T13:30:54.219Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.schedulers.html</loc>
-    <lastmod>2026-04-15T13:30:55.187Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.samplers.multipack.html</loc>
-    <lastmod>2026-04-15T13:30:55.696Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.chat.format.shared.html</loc>
-    <lastmod>2026-04-15T13:30:54.156Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.multimodal.html</loc>
-    <lastmod>2026-04-15T13:30:55.344Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/multimodal.html</loc>
-    <lastmod>2026-04-15T13:27:31.206Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/input_output.html</loc>
-    <lastmod>2026-04-15T13:27:31.205Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/multi-gpu.html</loc>
-    <lastmod>2026-04-15T13:27:31.206Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/lora_optims.html</loc>
-    <lastmod>2026-04-15T13:27:31.206Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/telemetry.html</loc>
-    <lastmod>2026-04-15T13:27:31.208Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/batch_vs_grad.html</loc>
-    <lastmod>2026-04-15T13:27:31.202Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/custom_integrations.html</loc>
-    <lastmod>2026-04-15T13:27:31.202Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/fsdp_qlora.html</loc>
-    <lastmod>2026-04-15T13:27:31.203Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/nccl.html</loc>
-    <lastmod>2026-04-15T13:27:31.206Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/vllm_serving.html</loc>
-    <lastmod>2026-04-15T13:27:31.208Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/attention.html</loc>
-    <lastmod>2026-04-15T13:27:31.202Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/multipack.html</loc>
-    <lastmod>2026-04-15T13:27:31.206Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/torchao.html</loc>
-    <lastmod>2026-04-15T13:27:31.208Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/nd_parallelism.html</loc>
-    <lastmod>2026-04-15T13:27:31.206Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/mac.html</loc>
-    <lastmod>2026-04-15T13:27:31.206Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/reward_modelling.html</loc>
-    <lastmod>2026-04-15T13:27:31.206Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/agents/model_architectures.html</loc>
-    <lastmod>2026-04-15T13:27:31.202Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/agents/grpo.html</loc>
-    <lastmod>2026-04-15T13:27:31.202Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/agents/pretraining.html</loc>
-    <lastmod>2026-04-15T13:27:31.202Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/agents/new_model_support.html</loc>
-    <lastmod>2026-04-15T13:27:31.202Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/optimizers.html</loc>
-    <lastmod>2026-04-15T13:27:31.206Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/examples/colab-notebooks/colab-axolotl-example.html</loc>
-    <lastmod>2026-04-15T13:27:31.212Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/src/axolotl/integrations/LICENSE.html</loc>
-    <lastmod>2026-04-15T13:27:31.236Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html</loc>
-    <lastmod>2026-04-15T13:27:31.237Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/installation.html</loc>
-    <lastmod>2026-04-15T13:27:31.206Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/ebft.html</loc>
-    <lastmod>2026-04-15T13:27:31.203Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/agents/reward_modelling.html</loc>
-    <lastmod>2026-04-15T13:27:31.202Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/agents/sft.html</loc>
-    <lastmod>2026-04-15T13:27:31.202Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/agents/preference_tuning.html</loc>
-    <lastmod>2026-04-15T13:27:31.202Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/mixed_precision.html</loc>
-    <lastmod>2026-04-15T13:27:31.206Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/docker.html</loc>
-    <lastmod>2026-04-15T13:27:31.203Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/grpo.html</loc>
-    <lastmod>2026-04-15T13:27:31.203Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/streaming.html</loc>
-    <lastmod>2026-04-15T13:27:31.208Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/choosing_method.html</loc>
-    <lastmod>2026-04-15T13:27:31.202Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/dataset_loading.html</loc>
-    <lastmod>2026-04-15T13:27:31.203Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/sequence_parallelism.html</loc>
-    <lastmod>2026-04-15T13:27:31.208Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/optimizations.html</loc>
-    <lastmod>2026-04-15T13:27:31.206Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/multi-node.html</loc>
-    <lastmod>2026-04-15T13:27:31.206Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/lr_groups.html</loc>
-    <lastmod>2026-04-15T13:27:31.206Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/quantize.html</loc>
-    <lastmod>2026-04-15T13:27:31.206Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/inference.html</loc>
-    <lastmod>2026-04-15T13:27:31.205Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/ray-integration.html</loc>
-    <lastmod>2026-04-15T13:27:31.206Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/amd_hpc.html</loc>
-    <lastmod>2026-04-15T13:27:31.202Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/unsloth.html</loc>
-    <lastmod>2026-04-15T13:27:31.208Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/getting-started.html</loc>
-    <lastmod>2026-04-15T13:27:31.203Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.enums.html</loc>
-    <lastmod>2026-04-15T13:30:55.375Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.builders.base.html</loc>
-    <lastmod>2026-04-15T13:30:54.095Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/logging_config.html</loc>
-    <lastmod>2026-04-15T13:30:54.087Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_instruct.html</loc>
-    <lastmod>2026-04-15T13:30:54.681Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.mlflow_.html</loc>
-    <lastmod>2026-04-15T13:30:55.715Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.utils.html</loc>
-    <lastmod>2026-04-15T13:30:54.383Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/loaders.patch_manager.html</loc>
-    <lastmod>2026-04-15T13:30:54.568Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.datasets.chat.html</loc>
-    <lastmod>2026-04-15T13:30:54.163Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.mixins.scheduler.html</loc>
-    <lastmod>2026-04-15T13:30:54.590Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/common.datasets.html</loc>
-    <lastmod>2026-04-15T13:30:55.611Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.kto.chatml.html</loc>
-    <lastmod>2026-04-15T13:30:54.829Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.user_defined.html</loc>
-    <lastmod>2026-04-15T13:30:54.706Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.chat_template.html</loc>
-    <lastmod>2026-04-15T13:30:54.663Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/kernels.utils.html</loc>
-    <lastmod>2026-04-15T13:30:54.953Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.mistral_attn_hijack_flash.html</loc>
-    <lastmod>2026-04-15T13:30:54.964Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.config.html</loc>
-    <lastmod>2026-04-15T13:30:55.281Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.dict.html</loc>
-    <lastmod>2026-04-15T13:30:55.219Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/loaders.constants.html</loc>
-    <lastmod>2026-04-15T13:30:54.570Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.data.batch_dataset_fetcher.html</loc>
-    <lastmod>2026-04-15T13:30:55.048Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/kernels.lora.html</loc>
-    <lastmod>2026-04-15T13:30:54.912Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/datasets.html</loc>
-    <lastmod>2026-04-15T13:30:54.007Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/common.const.html</loc>
-    <lastmod>2026-04-15T13:30:55.592Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.grpo.trainer.html</loc>
-    <lastmod>2026-04-15T13:30:54.500Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/kernels.swiglu.html</loc>
-    <lastmod>2026-04-15T13:30:54.936Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.utils.load.html</loc>
-    <lastmod>2026-04-15T13:30:54.410Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.passthrough.html</loc>
-    <lastmod>2026-04-15T13:30:54.809Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.delinearize_llama4.html</loc>
-    <lastmod>2026-04-15T13:30:54.303Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.stepwise_supervised.html</loc>
-    <lastmod>2026-04-15T13:30:54.742Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.freeze.html</loc>
-    <lastmod>2026-04-15T13:30:55.127Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.train.html</loc>
-    <lastmod>2026-04-15T13:30:54.229Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/models.mamba.modeling_mamba.html</loc>
-    <lastmod>2026-04-15T13:30:55.612Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_w_system.html</loc>
-    <lastmod>2026-04-15T13:30:54.696Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.kto.user_defined.html</loc>
-    <lastmod>2026-04-15T13:30:54.831Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/integrations.base.html</loc>
-    <lastmod>2026-04-15T13:30:55.561Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.perplexity.html</loc>
-    <lastmod>2026-04-15T13:30:55.704Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/loaders.tokenizer.html</loc>
-    <lastmod>2026-04-15T13:30:54.538Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.datasets.transforms.chat_builder.html</loc>
-    <lastmod>2026-04-15T13:30:54.172Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chat_template.html</loc>
-    <lastmod>2026-04-15T13:30:54.777Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/loaders.processor.html</loc>
-    <lastmod>2026-04-15T13:30:54.540Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.tokenization.html</loc>
-    <lastmod>2026-04-15T13:30:55.096Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.trainer_fsdp_optim.html</loc>
-    <lastmod>2026-04-15T13:30:55.028Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.data.sft.html</loc>
-    <lastmod>2026-04-15T13:30:55.238Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.user_defined.html</loc>
-    <lastmod>2026-04-15T13:30:54.807Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/integrations.spectrum.args.html</loc>
-    <lastmod>2026-04-15T13:30:55.589Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/integrations.cut_cross_entropy.args.html</loc>
-    <lastmod>2026-04-15T13:30:55.565Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.mixtral.html</loc>
-    <lastmod>2026-04-15T13:30:55.051Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.llama3.html</loc>
-    <lastmod>2026-04-15T13:30:54.790Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html</loc>
-    <lastmod>2026-04-15T13:30:55.055Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_xformers.html</loc>
-    <lastmod>2026-04-15T13:30:54.962Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.trainer.html</loc>
-    <lastmod>2026-04-15T13:30:55.149Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.btlm_attn_hijack_flash.html</loc>
-    <lastmod>2026-04-15T13:30:55.017Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.builders.causal.html</loc>
-    <lastmod>2026-04-15T13:30:54.100Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.cloud.base.html</loc>
-    <lastmod>2026-04-15T13:30:54.373Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.mixins.rng_state_loader.html</loc>
-    <lastmod>2026-04-15T13:30:54.581Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/index.html</loc>
-    <lastmod>2026-04-15T13:30:53.912Z</lastmod>
+    <lastmod>2026-04-21T14:20:09.902Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.base.html</loc>
-    <lastmod>2026-04-15T13:30:54.620Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.831Z</lastmod>
   </url>
   <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.unsloth_.html</loc>
-    <lastmod>2026-04-15T13:30:55.038Z</lastmod>
+    <loc>https://docs.axolotl.ai/docs/api/index.html</loc>
+    <lastmod>2026-04-21T14:20:08.140Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.mixins.rng_state_loader.html</loc>
+    <lastmod>2026-04-21T14:20:08.793Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.cloud.base.html</loc>
+    <lastmod>2026-04-21T14:20:08.589Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.builders.causal.html</loc>
+    <lastmod>2026-04-21T14:20:08.324Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.btlm_attn_hijack_flash.html</loc>
+    <lastmod>2026-04-21T14:20:09.218Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.trainer.html</loc>
+    <lastmod>2026-04-21T14:20:09.345Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_xformers.html</loc>
+    <lastmod>2026-04-21T14:20:09.164Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html</loc>
+    <lastmod>2026-04-21T14:20:09.253Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.llama3.html</loc>
+    <lastmod>2026-04-21T14:20:08.996Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.mixtral.html</loc>
+    <lastmod>2026-04-21T14:20:09.249Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/integrations.cut_cross_entropy.args.html</loc>
+    <lastmod>2026-04-21T14:20:09.751Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/integrations.spectrum.args.html</loc>
+    <lastmod>2026-04-21T14:20:09.774Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.user_defined.html</loc>
+    <lastmod>2026-04-21T14:20:09.013Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.data.sft.html</loc>
+    <lastmod>2026-04-21T14:20:09.432Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.trainer_fsdp_optim.html</loc>
+    <lastmod>2026-04-21T14:20:09.229Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.tokenization.html</loc>
+    <lastmod>2026-04-21T14:20:09.294Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/loaders.processor.html</loc>
+    <lastmod>2026-04-21T14:20:08.753Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chat_template.html</loc>
+    <lastmod>2026-04-21T14:20:08.984Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.datasets.transforms.chat_builder.html</loc>
+    <lastmod>2026-04-21T14:20:08.393Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/loaders.tokenizer.html</loc>
+    <lastmod>2026-04-21T14:20:08.751Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.perplexity.html</loc>
+    <lastmod>2026-04-21T14:20:09.887Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/integrations.base.html</loc>
+    <lastmod>2026-04-21T14:20:09.747Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.kto.user_defined.html</loc>
+    <lastmod>2026-04-21T14:20:09.036Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_w_system.html</loc>
+    <lastmod>2026-04-21T14:20:08.905Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/models.mamba.modeling_mamba.html</loc>
+    <lastmod>2026-04-21T14:20:09.797Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.train.html</loc>
+    <lastmod>2026-04-21T14:20:08.448Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.freeze.html</loc>
+    <lastmod>2026-04-21T14:20:09.325Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.stepwise_supervised.html</loc>
+    <lastmod>2026-04-21T14:20:08.950Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.delinearize_llama4.html</loc>
+    <lastmod>2026-04-21T14:20:08.520Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.passthrough.html</loc>
+    <lastmod>2026-04-21T14:20:09.014Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.utils.load.html</loc>
+    <lastmod>2026-04-21T14:20:08.625Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/kernels.swiglu.html</loc>
+    <lastmod>2026-04-21T14:20:09.139Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.grpo.trainer.html</loc>
+    <lastmod>2026-04-21T14:20:08.713Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/common.const.html</loc>
+    <lastmod>2026-04-21T14:20:09.777Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/datasets.html</loc>
+    <lastmod>2026-04-21T14:20:08.234Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/kernels.lora.html</loc>
+    <lastmod>2026-04-21T14:20:09.114Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.data.batch_dataset_fetcher.html</loc>
+    <lastmod>2026-04-21T14:20:09.248Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/loaders.constants.html</loc>
+    <lastmod>2026-04-21T14:20:08.782Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.dict.html</loc>
+    <lastmod>2026-04-21T14:20:09.413Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.config.html</loc>
+    <lastmod>2026-04-21T14:20:09.474Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.mistral_attn_hijack_flash.html</loc>
+    <lastmod>2026-04-21T14:20:09.166Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/kernels.utils.html</loc>
+    <lastmod>2026-04-21T14:20:09.155Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.chat_template.html</loc>
+    <lastmod>2026-04-21T14:20:08.873Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.user_defined.html</loc>
+    <lastmod>2026-04-21T14:20:08.915Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.kto.chatml.html</loc>
+    <lastmod>2026-04-21T14:20:09.034Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/common.datasets.html</loc>
+    <lastmod>2026-04-21T14:20:09.795Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.mixins.scheduler.html</loc>
+    <lastmod>2026-04-21T14:20:08.801Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.datasets.chat.html</loc>
+    <lastmod>2026-04-21T14:20:08.384Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/loaders.patch_manager.html</loc>
+    <lastmod>2026-04-21T14:20:08.780Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.utils.html</loc>
+    <lastmod>2026-04-21T14:20:08.598Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.mlflow_.html</loc>
+    <lastmod>2026-04-21T14:20:09.897Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_instruct.html</loc>
+    <lastmod>2026-04-21T14:20:08.890Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/logging_config.html</loc>
+    <lastmod>2026-04-21T14:20:08.311Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.builders.base.html</loc>
+    <lastmod>2026-04-21T14:20:08.319Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.enums.html</loc>
+    <lastmod>2026-04-21T14:20:09.565Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/getting-started.html</loc>
+    <lastmod>2026-04-21T14:17:14.741Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/multi-gpu.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/lora_optims.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/telemetry.html</loc>
+    <lastmod>2026-04-21T14:17:14.746Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/batch_vs_grad.html</loc>
+    <lastmod>2026-04-21T14:17:14.740Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/custom_integrations.html</loc>
+    <lastmod>2026-04-21T14:17:14.740Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/fsdp_qlora.html</loc>
+    <lastmod>2026-04-21T14:17:14.741Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/nccl.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/vllm_serving.html</loc>
+    <lastmod>2026-04-21T14:17:14.746Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/attention.html</loc>
+    <lastmod>2026-04-21T14:17:14.740Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/multipack.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/torchao.html</loc>
+    <lastmod>2026-04-21T14:17:14.746Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/nd_parallelism.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/mac.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/reward_modelling.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/agents/model_architectures.html</loc>
+    <lastmod>2026-04-21T14:17:14.740Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/agents/grpo.html</loc>
+    <lastmod>2026-04-21T14:17:14.740Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/agents/pretraining.html</loc>
+    <lastmod>2026-04-21T14:17:14.740Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/agents/new_model_support.html</loc>
+    <lastmod>2026-04-21T14:17:14.740Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/optimizers.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/examples/colab-notebooks/colab-axolotl-example.html</loc>
+    <lastmod>2026-04-21T14:17:14.750Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/src/axolotl/integrations/LICENSE.html</loc>
+    <lastmod>2026-04-21T14:17:14.779Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html</loc>
+    <lastmod>2026-04-21T14:17:14.781Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/installation.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/ebft.html</loc>
+    <lastmod>2026-04-21T14:17:14.741Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/agents/reward_modelling.html</loc>
+    <lastmod>2026-04-21T14:17:14.740Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/agents/sft.html</loc>
+    <lastmod>2026-04-21T14:17:14.740Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/agents/preference_tuning.html</loc>
+    <lastmod>2026-04-21T14:17:14.740Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/mixed_precision.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/docker.html</loc>
+    <lastmod>2026-04-21T14:17:14.741Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/grpo.html</loc>
+    <lastmod>2026-04-21T14:17:14.741Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/streaming.html</loc>
+    <lastmod>2026-04-21T14:17:14.746Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/choosing_method.html</loc>
+    <lastmod>2026-04-21T14:17:14.740Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/dataset_loading.html</loc>
+    <lastmod>2026-04-21T14:17:14.741Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/sequence_parallelism.html</loc>
+    <lastmod>2026-04-21T14:17:14.746Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/optimizations.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/multi-node.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/lr_groups.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/quantize.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/inference.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/ray-integration.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/amd_hpc.html</loc>
+    <lastmod>2026-04-21T14:17:14.740Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/input_output.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/multimodal.html</loc>
+    <lastmod>2026-04-21T14:17:14.744Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.multimodal.html</loc>
+    <lastmod>2026-04-21T14:20:09.535Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.chat.format.shared.html</loc>
+    <lastmod>2026-04-21T14:20:08.378Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.samplers.multipack.html</loc>
+    <lastmod>2026-04-21T14:20:09.879Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.schedulers.html</loc>
+    <lastmod>2026-04-21T14:20:09.382Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.main.html</loc>
+    <lastmod>2026-04-21T14:20:08.438Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.stablelm_attn_hijack_flash.html</loc>
+    <lastmod>2026-04-21T14:20:09.225Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.utils.fetch.html</loc>
+    <lastmod>2026-04-21T14:20:08.618Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/loaders.adapter.html</loc>
+    <lastmod>2026-04-21T14:20:08.759Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.llama2_chat.html</loc>
+    <lastmod>2026-04-21T14:20:08.930Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.merge_sharded_fsdp_weights.html</loc>
+    <lastmod>2026-04-21T14:20:08.561Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/common.architectures.html</loc>
+    <lastmod>2026-04-21T14:20:09.776Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_flash.html</loc>
+    <lastmod>2026-04-21T14:20:09.162Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.utils.html</loc>
+    <lastmod>2026-04-21T14:20:08.729Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/kernels.geglu.html</loc>
+    <lastmod>2026-04-21T14:20:09.127Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.collators.mamba.html</loc>
+    <lastmod>2026-04-21T14:20:09.825Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.quantization.html</loc>
+    <lastmod>2026-04-21T14:20:09.455Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.vllm_serve.html</loc>
+    <lastmod>2026-04-21T14:20:08.585Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.ctx_managers.sequence_parallel.html</loc>
+    <lastmod>2026-04-21T14:20:08.829Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.peft.html</loc>
+    <lastmod>2026-04-21T14:20:09.525Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/integrations.kd.trainer.html</loc>
+    <lastmod>2026-04-21T14:20:09.762Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.model_shard_quant.html</loc>
+    <lastmod>2026-04-21T14:20:09.308Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.collators.batching.html</loc>
+    <lastmod>2026-04-21T14:20:09.820Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.trl.html</loc>
+    <lastmod>2026-04-21T14:20:09.529Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.builders.rl.html</loc>
+    <lastmod>2026-04-21T14:20:08.329Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_chat.html</loc>
+    <lastmod>2026-04-21T14:20:08.889Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.trl.html</loc>
+    <lastmod>2026-04-21T14:20:08.681Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/integrations.grokfast.optimizer.html</loc>
+    <lastmod>2026-04-21T14:20:09.752Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.multipack.html</loc>
+    <lastmod>2026-04-21T14:20:09.168Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.config.html</loc>
+    <lastmod>2026-04-21T14:20:08.515Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.collators.mm_chat.html</loc>
+    <lastmod>2026-04-21T14:20:09.830Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.cloud.modal_.html</loc>
+    <lastmod>2026-04-21T14:20:08.597Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.data.streaming.html</loc>
+    <lastmod>2026-04-21T14:20:09.424Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.input_output.html</loc>
+    <lastmod>2026-04-21T14:20:08.945Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.inference.html</loc>
+    <lastmod>2026-04-21T14:20:08.537Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.orcamini.html</loc>
+    <lastmod>2026-04-21T14:20:08.963Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/train.html</loc>
+    <lastmod>2026-04-21T14:20:08.214Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.mamba.html</loc>
+    <lastmod>2026-04-21T14:20:08.687Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/integrations.lm_eval.args.html</loc>
+    <lastmod>2026-04-21T14:20:09.770Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.chat.messages.html</loc>
+    <lastmod>2026-04-21T14:20:08.373Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.zephyr.html</loc>
+    <lastmod>2026-04-21T14:20:09.011Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.base.html</loc>
+    <lastmod>2026-04-21T14:20:08.663Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.qat.html</loc>
+    <lastmod>2026-04-21T14:20:09.910Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.bench.html</loc>
+    <lastmod>2026-04-21T14:20:09.313Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.optimizers.adopt.html</loc>
+    <lastmod>2026-04-21T14:20:09.423Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.bradley_terry.llama3.html</loc>
+    <lastmod>2026-04-21T14:20:09.065Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.pygmalion.html</loc>
+    <lastmod>2026-04-21T14:20:08.971Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.model.html</loc>
+    <lastmod>2026-04-21T14:20:09.483Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.lora_kernels.html</loc>
+    <lastmod>2026-04-21T14:20:09.209Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.utils.html</loc>
+    <lastmod>2026-04-21T14:20:09.571Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.checks.html</loc>
+    <lastmod>2026-04-21T14:20:08.494Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_disk.html</loc>
+    <lastmod>2026-04-21T14:20:09.285Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chatml.html</loc>
+    <lastmod>2026-04-21T14:20:09.008Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.chat_templates.html</loc>
+    <lastmod>2026-04-21T14:20:09.296Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.orpo.chat_template.html</loc>
+    <lastmod>2026-04-21T14:20:09.060Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.utils.html</loc>
+    <lastmod>2026-04-21T14:20:09.216Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.profiler.html</loc>
+    <lastmod>2026-04-21T14:20:09.891Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.schemas.integrations.html</loc>
-    <lastmod>2026-04-15T13:30:55.364Z</lastmod>
+    <lastmod>2026-04-21T14:20:09.554Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.training_args.html</loc>
-    <lastmod>2026-04-15T13:30:54.122Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.345Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.utils.sweeps.html</loc>
-    <lastmod>2026-04-15T13:30:54.417Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.632Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.art.html</loc>
-    <lastmod>2026-04-15T13:30:54.268Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.486Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/monkeypatch.relora.html</loc>
-    <lastmod>2026-04-15T13:30:54.971Z</lastmod>
+    <lastmod>2026-04-21T14:20:09.171Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_tokenizers.html</loc>
-    <lastmod>2026-04-15T13:30:54.075Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.300Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.args.html</loc>
-    <lastmod>2026-04-15T13:30:54.264Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.482Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.collators.core.html</loc>
-    <lastmod>2026-04-15T13:30:55.614Z</lastmod>
+    <lastmod>2026-04-21T14:20:09.798Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.distributed.html</loc>
-    <lastmod>2026-04-15T13:30:55.211Z</lastmod>
+    <lastmod>2026-04-21T14:20:09.407Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.evaluate.html</loc>
-    <lastmod>2026-04-15T13:30:54.239Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.458Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/evaluate.html</loc>
-    <lastmod>2026-04-15T13:30:53.999Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.226Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.metharme.html</loc>
-    <lastmod>2026-04-15T13:30:54.751Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.958Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.lisa.html</loc>
-    <lastmod>2026-04-15T13:30:55.711Z</lastmod>
+    <lastmod>2026-04-21T14:20:09.893Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.utils.args.html</loc>
-    <lastmod>2026-04-15T13:30:54.396Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.612Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.schemas.training.html</loc>
-    <lastmod>2026-04-15T13:30:55.298Z</lastmod>
+    <lastmod>2026-04-21T14:20:09.491Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.trainers.mixins.optimizer.html</loc>
-    <lastmod>2026-04-15T13:30:54.577Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.789Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/cli.utils.train.html</loc>
-    <lastmod>2026-04-15T13:30:54.431Z</lastmod>
+    <lastmod>2026-04-21T14:20:08.646Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/faq.html</loc>
-    <lastmod>2026-04-15T13:27:31.203Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.741Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/dataset_preprocessing.html</loc>
-    <lastmod>2026-04-15T13:27:31.203Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.741Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/dataset-formats/inst_tune.html</loc>
-    <lastmod>2026-04-15T13:27:31.203Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.741Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/dataset-formats/pretraining.html</loc>
-    <lastmod>2026-04-15T13:27:31.203Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.741Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/dataset-formats/index.html</loc>
-    <lastmod>2026-04-15T13:27:31.203Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.741Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/dataset-formats/stepwise_supervised.html</loc>
-    <lastmod>2026-04-15T13:27:31.203Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.741Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/cli.html</loc>
-    <lastmod>2026-04-15T13:27:31.202Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.740Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/config-reference.html</loc>
-    <lastmod>2026-04-15T13:31:17.093Z</lastmod>
+    <lastmod>2026-04-21T14:20:31.556Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/debugging.html</loc>
-    <lastmod>2026-04-15T13:27:31.203Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.741Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/magistral/vision.html</loc>
-    <lastmod>2026-04-15T13:31:18.096Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.802Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/trinity.html</loc>
-    <lastmod>2026-04-15T13:31:18.092Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.800Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/gpt-oss.html</loc>
-    <lastmod>2026-04-15T13:31:18.100Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.806Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/LiquidAI.html</loc>
-    <lastmod>2026-04-15T13:31:18.101Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.807Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/granite4.html</loc>
-    <lastmod>2026-04-15T13:31:18.101Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.807Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/voxtral.html</loc>
-    <lastmod>2026-04-15T13:31:18.097Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.803Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/mistral-small.html</loc>
-    <lastmod>2026-04-15T13:31:18.096Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.803Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/llama-4.html</loc>
-    <lastmod>2026-04-15T13:31:18.098Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.804Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/llama-2.html</loc>
-    <lastmod>2026-04-15T13:31:18.098Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.804Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/jamba.html</loc>
-    <lastmod>2026-04-15T13:31:18.102Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.808Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/ministral3/think.html</loc>
-    <lastmod>2026-04-15T13:31:18.094Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.801Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/orpheus.html</loc>
-    <lastmod>2026-04-15T13:31:18.102Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.808Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/index.html</loc>
-    <lastmod>2026-04-15T13:31:18.103Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.808Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/olmo3.html</loc>
-    <lastmod>2026-04-15T13:31:18.092Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.799Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/magistral.html</loc>
-    <lastmod>2026-04-15T13:31:18.095Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.802Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/ministral3.html</loc>
-    <lastmod>2026-04-15T13:31:18.094Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.801Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/models/plano.html</loc>
-    <lastmod>2026-04-15T13:31:18.091Z</lastmod>
+    <lastmod>2026-04-21T14:20:32.798Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/index.html</loc>
-    <lastmod>2026-04-15T13:27:31.229Z</lastmod>
+    <lastmod>2026-04-21T14:17:14.770Z</lastmod>
   </url>
 </urlset>
diff --git a/src/axolotl/integrations/LICENSE.html b/src/axolotl/integrations/LICENSE.html
index 2079942fa..bc366f03f 100644
--- a/src/axolotl/integrations/LICENSE.html
+++ b/src/axolotl/integrations/LICENSE.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container"> 
diff --git a/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html b/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html
index abcccc6dc..52b26599a 100644
--- a/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html
+++ b/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html
@@ -661,12 +661,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
   <a href="../../../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
  <span class="menu-text">FSDP + QLoRA</span></a>
   </div>
-</li>
-          <li class="sidebar-item">
-  <div class="sidebar-item-container"> 
-  <a href="../../../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
- <span class="menu-text">Unsloth</span></a>
-  </div>
 </li>
           <li class="sidebar-item">
   <div class="sidebar-item-container">