From eac3a4860e5dbf7d21625964ba84c644d2b3fa87 Mon Sep 17 00:00:00 2001 From: Quarto GHA Workflow Runner Date: Thu, 12 Jun 2025 17:25:50 +0000 Subject: [PATCH] Built site for gh-pages --- .github/workflows/main.yml | 8 +- .nojekyll | 2 +- docs/api/core.training_args.html | 513 +++++++++++++------------ docs/api/utils.samplers.multipack.html | 2 +- docs/qat.html | 2 +- index.html | 250 ++---------- search.json | 48 +-- sitemap.xml | 378 +++++++++--------- 8 files changed, 501 insertions(+), 702 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 01606f902..7ff712757 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -29,12 +29,12 @@ jobs: - cuda: 126 cuda_version: 12.6.3 python_version: "3.11" - pytorch: 2.7.0 + pytorch: 2.7.1 axolotl_extras: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" - pytorch: 2.7.0 + pytorch: 2.7.1 axolotl_extras: runs-on: axolotl-gpu-runner steps: @@ -97,12 +97,12 @@ jobs: - cuda: 126 cuda_version: 12.6.3 python_version: "3.11" - pytorch: 2.7.0 + pytorch: 2.7.1 axolotl_extras: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" - pytorch: 2.7.0 + pytorch: 2.7.1 axolotl_extras: runs-on: axolotl-gpu-runner steps: diff --git a/.nojekyll b/.nojekyll index 16a82cc64..81c8edb89 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -91cd29d0 \ No newline at end of file +72083c85 \ No newline at end of file diff --git a/docs/api/core.training_args.html b/docs/api/core.training_args.html index a74183f9e..f35ea760a 100644 --- a/docs/api/core.training_args.html +++ b/docs/api/core.training_args.html @@ -547,43 +547,44 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); sample_packing_bin_size=200, sample_packing_group_size=100000, max_seq_length=2048, - relora_steps=None, - relora_warmup_steps=None, - relora_anneal_steps=None, - relora_prune_ratio=0.9, - bench_split='eval', - bench_dataset='pharaouk/dharma-1/dharma_1_mini.json', - do_bench_eval=False, - do_causal_lm_eval=False, - max_bench_samples=None, - bench_source_max_len=2048, - dataloader_prefetch_factor=None, - cosine_min_lr_ratio=None, - cosine_constant_lr_ratio=None, - loraplus_lr_ratio=None, - loraplus_lr_embedding=1e-06, - embedding_lr_scale=None, - lr_groups=None, - embedding_lr=None, - qlora=False, - orpo_alpha=None, - lisa_n_layers=None, - lisa_step_interval=None, - lisa_layers_attribute=None, - curriculum_sampling=None, - alternate_lr_scheduler_type=None, - chat_template=None, - kd_ce_alpha=None, - kd_alpha=1.0, - kd_temperature=1.0, - kd_zscore_base_temp=None, - kd_top_k_before_softmax=None, - adam_beta3=None, - adam_epsilon2=None, - image_size=None, - image_resize_algorithm=None, - simpo_gamma=None, -) + dataset_num_proc=None, + relora_steps=None, + relora_warmup_steps=None, + relora_anneal_steps=None, + relora_prune_ratio=0.9, + bench_split='eval', + bench_dataset='pharaouk/dharma-1/dharma_1_mini.json', + do_bench_eval=False, + do_causal_lm_eval=False, + max_bench_samples=None, + bench_source_max_len=2048, + dataloader_prefetch_factor=None, + cosine_min_lr_ratio=None, + cosine_constant_lr_ratio=None, + loraplus_lr_ratio=None, + loraplus_lr_embedding=1e-06, + embedding_lr_scale=None, + lr_groups=None, + embedding_lr=None, + qlora=False, + orpo_alpha=None, + lisa_n_layers=None, + lisa_step_interval=None, + lisa_layers_attribute=None, + curriculum_sampling=None, + alternate_lr_scheduler_type=None, + chat_template=None, + kd_ce_alpha=None, + kd_alpha=1.0, + kd_temperature=1.0, + kd_zscore_base_temp=None, + kd_top_k_before_softmax=None, + adam_beta3=None, + adam_epsilon2=None, + image_size=None, + image_resize_algorithm=None, + simpo_gamma=None, +)

CPO config for CPO training

@@ -600,42 +601,43 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); sample_packing_bin_size=200, sample_packing_group_size=100000, max_seq_length=2048, - relora_steps=None, - relora_warmup_steps=None, - relora_anneal_steps=None, - relora_prune_ratio=0.9, - bench_split='eval', - bench_dataset='pharaouk/dharma-1/dharma_1_mini.json', - do_bench_eval=False, - do_causal_lm_eval=False, - max_bench_samples=None, - bench_source_max_len=2048, - dataloader_prefetch_factor=None, - cosine_min_lr_ratio=None, - cosine_constant_lr_ratio=None, - loraplus_lr_ratio=None, - loraplus_lr_embedding=1e-06, - embedding_lr_scale=None, - lr_groups=None, - embedding_lr=None, - qlora=False, - orpo_alpha=None, - lisa_n_layers=None, - lisa_step_interval=None, - lisa_layers_attribute=None, - curriculum_sampling=None, - alternate_lr_scheduler_type=None, - chat_template=None, - kd_ce_alpha=None, - kd_alpha=1.0, - kd_temperature=1.0, - kd_zscore_base_temp=None, - kd_top_k_before_softmax=None, - adam_beta3=None, - adam_epsilon2=None, - image_size=None, - image_resize_algorithm=None, -) + dataset_num_proc=None, + relora_steps=None, + relora_warmup_steps=None, + relora_anneal_steps=None, + relora_prune_ratio=0.9, + bench_split='eval', + bench_dataset='pharaouk/dharma-1/dharma_1_mini.json', + do_bench_eval=False, + do_causal_lm_eval=False, + max_bench_samples=None, + bench_source_max_len=2048, + dataloader_prefetch_factor=None, + cosine_min_lr_ratio=None, + cosine_constant_lr_ratio=None, + loraplus_lr_ratio=None, + loraplus_lr_embedding=1e-06, + embedding_lr_scale=None, + lr_groups=None, + embedding_lr=None, + qlora=False, + orpo_alpha=None, + lisa_n_layers=None, + lisa_step_interval=None, + lisa_layers_attribute=None, + curriculum_sampling=None, + alternate_lr_scheduler_type=None, + chat_template=None, + kd_ce_alpha=None, + kd_alpha=1.0, + kd_temperature=1.0, + kd_zscore_base_temp=None, + kd_top_k_before_softmax=None, + adam_beta3=None, + adam_epsilon2=None, + image_size=None, + image_resize_algorithm=None, +)

KTO config for KTO training

@@ -652,42 +654,43 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); sample_packing_bin_size=200, sample_packing_group_size=100000, max_seq_length=2048, - relora_steps=None, - relora_warmup_steps=None, - relora_anneal_steps=None, - relora_prune_ratio=0.9, - bench_split='eval', - bench_dataset='pharaouk/dharma-1/dharma_1_mini.json', - do_bench_eval=False, - do_causal_lm_eval=False, - max_bench_samples=None, - bench_source_max_len=2048, - dataloader_prefetch_factor=None, - cosine_min_lr_ratio=None, - cosine_constant_lr_ratio=None, - loraplus_lr_ratio=None, - loraplus_lr_embedding=1e-06, - embedding_lr_scale=None, - lr_groups=None, - embedding_lr=None, - qlora=False, - orpo_alpha=None, - lisa_n_layers=None, - lisa_step_interval=None, - lisa_layers_attribute=None, - curriculum_sampling=None, - alternate_lr_scheduler_type=None, - chat_template=None, - kd_ce_alpha=None, - kd_alpha=1.0, - kd_temperature=1.0, - kd_zscore_base_temp=None, - kd_top_k_before_softmax=None, - adam_beta3=None, - adam_epsilon2=None, - image_size=None, - image_resize_algorithm=None, -) + dataset_num_proc=None, + relora_steps=None, + relora_warmup_steps=None, + relora_anneal_steps=None, + relora_prune_ratio=0.9, + bench_split='eval', + bench_dataset='pharaouk/dharma-1/dharma_1_mini.json', + do_bench_eval=False, + do_causal_lm_eval=False, + max_bench_samples=None, + bench_source_max_len=2048, + dataloader_prefetch_factor=None, + cosine_min_lr_ratio=None, + cosine_constant_lr_ratio=None, + loraplus_lr_ratio=None, + loraplus_lr_embedding=1e-06, + embedding_lr_scale=None, + lr_groups=None, + embedding_lr=None, + qlora=False, + orpo_alpha=None, + lisa_n_layers=None, + lisa_step_interval=None, + lisa_layers_attribute=None, + curriculum_sampling=None, + alternate_lr_scheduler_type=None, + chat_template=None, + kd_ce_alpha=None, + kd_alpha=1.0, + kd_temperature=1.0, + kd_zscore_base_temp=None, + kd_top_k_before_softmax=None, + adam_beta3=None, + adam_epsilon2=None, + image_size=None, + image_resize_algorithm=None, +)

ORPO config for ORPO training

@@ -704,42 +707,43 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); sample_packing_bin_size=200, sample_packing_group_size=100000, max_seq_length=2048, - relora_steps=None, - relora_warmup_steps=None, - relora_anneal_steps=None, - relora_prune_ratio=0.9, - bench_split='eval', - bench_dataset='pharaouk/dharma-1/dharma_1_mini.json', - do_bench_eval=False, - do_causal_lm_eval=False, - max_bench_samples=None, - bench_source_max_len=2048, - dataloader_prefetch_factor=None, - cosine_min_lr_ratio=None, - cosine_constant_lr_ratio=None, - loraplus_lr_ratio=None, - loraplus_lr_embedding=1e-06, - embedding_lr_scale=None, - lr_groups=None, - embedding_lr=None, - qlora=False, - orpo_alpha=None, - lisa_n_layers=None, - lisa_step_interval=None, - lisa_layers_attribute=None, - curriculum_sampling=None, - alternate_lr_scheduler_type=None, - chat_template=None, - kd_ce_alpha=None, - kd_alpha=1.0, - kd_temperature=1.0, - kd_zscore_base_temp=None, - kd_top_k_before_softmax=None, - adam_beta3=None, - adam_epsilon2=None, - image_size=None, - image_resize_algorithm=None, -) + dataset_num_proc=None, + relora_steps=None, + relora_warmup_steps=None, + relora_anneal_steps=None, + relora_prune_ratio=0.9, + bench_split='eval', + bench_dataset='pharaouk/dharma-1/dharma_1_mini.json', + do_bench_eval=False, + do_causal_lm_eval=False, + max_bench_samples=None, + bench_source_max_len=2048, + dataloader_prefetch_factor=None, + cosine_min_lr_ratio=None, + cosine_constant_lr_ratio=None, + loraplus_lr_ratio=None, + loraplus_lr_embedding=1e-06, + embedding_lr_scale=None, + lr_groups=None, + embedding_lr=None, + qlora=False, + orpo_alpha=None, + lisa_n_layers=None, + lisa_step_interval=None, + lisa_layers_attribute=None, + curriculum_sampling=None, + alternate_lr_scheduler_type=None, + chat_template=None, + kd_ce_alpha=None, + kd_alpha=1.0, + kd_temperature=1.0, + kd_zscore_base_temp=None, + kd_top_k_before_softmax=None, + adam_beta3=None, + adam_epsilon2=None, + image_size=None, + image_resize_algorithm=None, +)

PRM config for PRM training

@@ -756,42 +760,43 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); sample_packing_bin_size=200, sample_packing_group_size=100000, max_seq_length=2048, - relora_steps=None, - relora_warmup_steps=None, - relora_anneal_steps=None, - relora_prune_ratio=0.9, - bench_split='eval', - bench_dataset='pharaouk/dharma-1/dharma_1_mini.json', - do_bench_eval=False, - do_causal_lm_eval=False, - max_bench_samples=None, - bench_source_max_len=2048, - dataloader_prefetch_factor=None, - cosine_min_lr_ratio=None, - cosine_constant_lr_ratio=None, - loraplus_lr_ratio=None, - loraplus_lr_embedding=1e-06, - embedding_lr_scale=None, - lr_groups=None, - embedding_lr=None, - qlora=False, - orpo_alpha=None, - lisa_n_layers=None, - lisa_step_interval=None, - lisa_layers_attribute=None, - curriculum_sampling=None, - alternate_lr_scheduler_type=None, - chat_template=None, - kd_ce_alpha=None, - kd_alpha=1.0, - kd_temperature=1.0, - kd_zscore_base_temp=None, - kd_top_k_before_softmax=None, - adam_beta3=None, - adam_epsilon2=None, - image_size=None, - image_resize_algorithm=None, -) + dataset_num_proc=None, + relora_steps=None, + relora_warmup_steps=None, + relora_anneal_steps=None, + relora_prune_ratio=0.9, + bench_split='eval', + bench_dataset='pharaouk/dharma-1/dharma_1_mini.json', + do_bench_eval=False, + do_causal_lm_eval=False, + max_bench_samples=None, + bench_source_max_len=2048, + dataloader_prefetch_factor=None, + cosine_min_lr_ratio=None, + cosine_constant_lr_ratio=None, + loraplus_lr_ratio=None, + loraplus_lr_embedding=1e-06, + embedding_lr_scale=None, + lr_groups=None, + embedding_lr=None, + qlora=False, + orpo_alpha=None, + lisa_n_layers=None, + lisa_step_interval=None, + lisa_layers_attribute=None, + curriculum_sampling=None, + alternate_lr_scheduler_type=None, + chat_template=None, + kd_ce_alpha=None, + kd_alpha=1.0, + kd_temperature=1.0, + kd_zscore_base_temp=None, + kd_top_k_before_softmax=None, + adam_beta3=None, + adam_epsilon2=None, + image_size=None, + image_resize_algorithm=None, +)

Reward config for Reward training

@@ -808,42 +813,43 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); sample_packing_bin_size=200, sample_packing_group_size=100000, max_seq_length=2048, - relora_steps=None, - relora_warmup_steps=None, - relora_anneal_steps=None, - relora_prune_ratio=0.9, - bench_split='eval', - bench_dataset='pharaouk/dharma-1/dharma_1_mini.json', - do_bench_eval=False, - do_causal_lm_eval=False, - max_bench_samples=None, - bench_source_max_len=2048, - dataloader_prefetch_factor=None, - cosine_min_lr_ratio=None, - cosine_constant_lr_ratio=None, - loraplus_lr_ratio=None, - loraplus_lr_embedding=1e-06, - embedding_lr_scale=None, - lr_groups=None, - embedding_lr=None, - qlora=False, - orpo_alpha=None, - lisa_n_layers=None, - lisa_step_interval=None, - lisa_layers_attribute=None, - curriculum_sampling=None, - alternate_lr_scheduler_type=None, - chat_template=None, - kd_ce_alpha=None, - kd_alpha=1.0, - kd_temperature=1.0, - kd_zscore_base_temp=None, - kd_top_k_before_softmax=None, - adam_beta3=None, - adam_epsilon2=None, - image_size=None, - image_resize_algorithm=None, -) + dataset_num_proc=None, + relora_steps=None, + relora_warmup_steps=None, + relora_anneal_steps=None, + relora_prune_ratio=0.9, + bench_split='eval', + bench_dataset='pharaouk/dharma-1/dharma_1_mini.json', + do_bench_eval=False, + do_causal_lm_eval=False, + max_bench_samples=None, + bench_source_max_len=2048, + dataloader_prefetch_factor=None, + cosine_min_lr_ratio=None, + cosine_constant_lr_ratio=None, + loraplus_lr_ratio=None, + loraplus_lr_embedding=1e-06, + embedding_lr_scale=None, + lr_groups=None, + embedding_lr=None, + qlora=False, + orpo_alpha=None, + lisa_n_layers=None, + lisa_step_interval=None, + lisa_layers_attribute=None, + curriculum_sampling=None, + alternate_lr_scheduler_type=None, + chat_template=None, + kd_ce_alpha=None, + kd_alpha=1.0, + kd_temperature=1.0, + kd_zscore_base_temp=None, + kd_top_k_before_softmax=None, + adam_beta3=None, + adam_epsilon2=None, + image_size=None, + image_resize_algorithm=None, +)

Training arguments for Causal trainer

This code is duplicated due to HF TrainingArguments not setting output_dir with a default value so it can’t be used as a mixin.

@@ -862,42 +868,43 @@ default value so it can’t be used as a mixin.

sample_packing_bin_size=200, sample_packing_group_size=100000, max_seq_length=2048, - relora_steps=None, - relora_warmup_steps=None, - relora_anneal_steps=None, - relora_prune_ratio=0.9, - bench_split='eval', - bench_dataset='pharaouk/dharma-1/dharma_1_mini.json', - do_bench_eval=False, - do_causal_lm_eval=False, - max_bench_samples=None, - bench_source_max_len=2048, - dataloader_prefetch_factor=None, - cosine_min_lr_ratio=None, - cosine_constant_lr_ratio=None, - loraplus_lr_ratio=None, - loraplus_lr_embedding=1e-06, - embedding_lr_scale=None, - lr_groups=None, - embedding_lr=None, - qlora=False, - orpo_alpha=None, - lisa_n_layers=None, - lisa_step_interval=None, - lisa_layers_attribute=None, - curriculum_sampling=None, - alternate_lr_scheduler_type=None, - chat_template=None, - kd_ce_alpha=None, - kd_alpha=1.0, - kd_temperature=1.0, - kd_zscore_base_temp=None, - kd_top_k_before_softmax=None, - adam_beta3=None, - adam_epsilon2=None, - image_size=None, - image_resize_algorithm=None, -) + dataset_num_proc=None, + relora_steps=None, + relora_warmup_steps=None, + relora_anneal_steps=None, + relora_prune_ratio=0.9, + bench_split='eval', + bench_dataset='pharaouk/dharma-1/dharma_1_mini.json', + do_bench_eval=False, + do_causal_lm_eval=False, + max_bench_samples=None, + bench_source_max_len=2048, + dataloader_prefetch_factor=None, + cosine_min_lr_ratio=None, + cosine_constant_lr_ratio=None, + loraplus_lr_ratio=None, + loraplus_lr_embedding=1e-06, + embedding_lr_scale=None, + lr_groups=None, + embedding_lr=None, + qlora=False, + orpo_alpha=None, + lisa_n_layers=None, + lisa_step_interval=None, + lisa_layers_attribute=None, + curriculum_sampling=None, + alternate_lr_scheduler_type=None, + chat_template=None, + kd_ce_alpha=None, + kd_alpha=1.0, + kd_temperature=1.0, + kd_zscore_base_temp=None, + kd_top_k_before_softmax=None, + adam_beta3=None, + adam_epsilon2=None, + image_size=None, + image_resize_algorithm=None, +)

Mixin class for the Axolotl training args.

diff --git a/docs/api/utils.samplers.multipack.html b/docs/api/utils.samplers.multipack.html index 6d351ba0f..75a23093a 100644 --- a/docs/api/utils.samplers.multipack.html +++ b/docs/api/utils.samplers.multipack.html @@ -520,7 +520,7 @@ into fixed-capacity batches to optimize memory usage and training throughput.

lengths, packing_efficiency_estimate=1.0, drop_last=False, - num_count_samples=16, + num_count_samples=8, sequential=False, group_size=100000, bin_size=200, diff --git a/docs/qat.html b/docs/qat.html index d94a67ed7..fff59d03e 100644 --- a/docs/qat.html +++ b/docs/qat.html @@ -512,7 +512,7 @@ and the QAT documentation in the weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8" group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after -

Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the quantize command to do this.

+

Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the quantize command to do this.

diff --git a/index.html b/index.html index abaab220a..d3bac7b26 100644 --- a/index.html +++ b/index.html @@ -467,16 +467,16 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});

On this page

@@ -510,27 +510,31 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); tests-nightly multigpu-semi-weekly tests

-

Axolotl is a tool designed to streamline post-training for various AI models. -Post-training refers to any modifications or additional training performed on -pre-trained models - including full model fine-tuning, parameter-efficient tuning (like -LoRA and QLoRA), supervised fine-tuning (SFT), instruction tuning, and alignment -techniques. With support for multiple model architectures and training configurations, -Axolotl makes it easy to get started with these techniques.

-

Axolotl is designed to work with YAML config files that contain everything you need to -preprocess a dataset, train or fine-tune a model, run model inference or evaluation, -and much more.

+
+

🎉 Latest Updates

+ +
+
+

✨ Overview

+

Axolotl is a tool designed to streamline post-training for various AI models.

Features:

+

🚀 Quick Start

Requirements:

@@ -562,22 +566,12 @@ and much more.

That’s it! Check out our Getting Started Guide for a more detailed walkthrough.

-
-

✨ Key Features

- -

📚 Documentation

-
-

Supported Models

- ---------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
fp16/fp32loraqloragptqgptq w/flash attnflash attnxformers attn
llama
Mistral
Mixtral-MoE
Mixtral8X22
Pythia
cerebras
btlm
mpt
falcon
gpt-j
XGen
phi
RWKV
Qwen
Gemma
Jamba
-

✅: supported -❌: not supported -❓: untested

-

❤️ Sponsors

Thank you to our sponsors who help make Axolotl possible:

diff --git a/search.json b/search.json index 41e427623..53e98255f 100644 --- a/search.json +++ b/search.json @@ -644,14 +644,14 @@ "href": "docs/api/core.training_args.html", "title": "core.training_args", "section": "", - "text": "core.training_args\nextra axolotl specific training args\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlCPOConfig\nCPO config for CPO training\n\n\nAxolotlKTOConfig\nKTO config for KTO training\n\n\nAxolotlORPOConfig\nORPO config for ORPO training\n\n\nAxolotlPRMConfig\nPRM config for PRM training\n\n\nAxolotlRewardConfig\nReward config for Reward training\n\n\nAxolotlTrainingArguments\nTraining arguments for Causal trainer\n\n\nAxolotlTrainingMixins\nMixin class for the Axolotl training args.\n\n\n\n\n\ncore.training_args.AxolotlCPOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n simpo_gamma=None,\n)\nCPO config for CPO training\n\n\n\ncore.training_args.AxolotlKTOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nKTO config for KTO training\n\n\n\ncore.training_args.AxolotlORPOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nORPO config for ORPO training\n\n\n\ncore.training_args.AxolotlPRMConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nPRM config for PRM training\n\n\n\ncore.training_args.AxolotlRewardConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nReward config for Reward training\n\n\n\ncore.training_args.AxolotlTrainingArguments(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nTraining arguments for Causal trainer\nThis code is duplicated due to HF TrainingArguments not setting output_dir with a\ndefault value so it can’t be used as a mixin.\n\n\n\ncore.training_args.AxolotlTrainingMixins(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nMixin class for the Axolotl training args." + "text": "core.training_args\nextra axolotl specific training args\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlCPOConfig\nCPO config for CPO training\n\n\nAxolotlKTOConfig\nKTO config for KTO training\n\n\nAxolotlORPOConfig\nORPO config for ORPO training\n\n\nAxolotlPRMConfig\nPRM config for PRM training\n\n\nAxolotlRewardConfig\nReward config for Reward training\n\n\nAxolotlTrainingArguments\nTraining arguments for Causal trainer\n\n\nAxolotlTrainingMixins\nMixin class for the Axolotl training args.\n\n\n\n\n\ncore.training_args.AxolotlCPOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n simpo_gamma=None,\n)\nCPO config for CPO training\n\n\n\ncore.training_args.AxolotlKTOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nKTO config for KTO training\n\n\n\ncore.training_args.AxolotlORPOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nORPO config for ORPO training\n\n\n\ncore.training_args.AxolotlPRMConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nPRM config for PRM training\n\n\n\ncore.training_args.AxolotlRewardConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nReward config for Reward training\n\n\n\ncore.training_args.AxolotlTrainingArguments(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nTraining arguments for Causal trainer\nThis code is duplicated due to HF TrainingArguments not setting output_dir with a\ndefault value so it can’t be used as a mixin.\n\n\n\ncore.training_args.AxolotlTrainingMixins(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nMixin class for the Axolotl training args." }, { "objectID": "docs/api/core.training_args.html#classes", "href": "docs/api/core.training_args.html#classes", "title": "core.training_args", "section": "", - "text": "Name\nDescription\n\n\n\n\nAxolotlCPOConfig\nCPO config for CPO training\n\n\nAxolotlKTOConfig\nKTO config for KTO training\n\n\nAxolotlORPOConfig\nORPO config for ORPO training\n\n\nAxolotlPRMConfig\nPRM config for PRM training\n\n\nAxolotlRewardConfig\nReward config for Reward training\n\n\nAxolotlTrainingArguments\nTraining arguments for Causal trainer\n\n\nAxolotlTrainingMixins\nMixin class for the Axolotl training args.\n\n\n\n\n\ncore.training_args.AxolotlCPOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n simpo_gamma=None,\n)\nCPO config for CPO training\n\n\n\ncore.training_args.AxolotlKTOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nKTO config for KTO training\n\n\n\ncore.training_args.AxolotlORPOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nORPO config for ORPO training\n\n\n\ncore.training_args.AxolotlPRMConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nPRM config for PRM training\n\n\n\ncore.training_args.AxolotlRewardConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nReward config for Reward training\n\n\n\ncore.training_args.AxolotlTrainingArguments(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nTraining arguments for Causal trainer\nThis code is duplicated due to HF TrainingArguments not setting output_dir with a\ndefault value so it can’t be used as a mixin.\n\n\n\ncore.training_args.AxolotlTrainingMixins(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nMixin class for the Axolotl training args." + "text": "Name\nDescription\n\n\n\n\nAxolotlCPOConfig\nCPO config for CPO training\n\n\nAxolotlKTOConfig\nKTO config for KTO training\n\n\nAxolotlORPOConfig\nORPO config for ORPO training\n\n\nAxolotlPRMConfig\nPRM config for PRM training\n\n\nAxolotlRewardConfig\nReward config for Reward training\n\n\nAxolotlTrainingArguments\nTraining arguments for Causal trainer\n\n\nAxolotlTrainingMixins\nMixin class for the Axolotl training args.\n\n\n\n\n\ncore.training_args.AxolotlCPOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n simpo_gamma=None,\n)\nCPO config for CPO training\n\n\n\ncore.training_args.AxolotlKTOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nKTO config for KTO training\n\n\n\ncore.training_args.AxolotlORPOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nORPO config for ORPO training\n\n\n\ncore.training_args.AxolotlPRMConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nPRM config for PRM training\n\n\n\ncore.training_args.AxolotlRewardConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nReward config for Reward training\n\n\n\ncore.training_args.AxolotlTrainingArguments(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nTraining arguments for Causal trainer\nThis code is duplicated due to HF TrainingArguments not setting output_dir with a\ndefault value so it can’t be used as a mixin.\n\n\n\ncore.training_args.AxolotlTrainingMixins(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nMixin class for the Axolotl training args." }, { "objectID": "docs/api/prompt_strategies.user_defined.html", @@ -1127,14 +1127,14 @@ "href": "docs/api/utils.samplers.multipack.html", "title": "utils.samplers.multipack", "section": "", - "text": "utils.samplers.multipack\nMultipack Batch Sampler - An efficient batch sampler for packing variable-length sequences\ninto fixed-capacity batches to optimize memory usage and training throughput.\n\n\n\n\n\nName\nDescription\n\n\n\n\nMultipackBatchSampler\nBatch sampler class for efficient packing of variable-length sequences\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler(\n sampler,\n batch_size,\n batch_max_len,\n lengths,\n packing_efficiency_estimate=1.0,\n drop_last=False,\n num_count_samples=16,\n sequential=False,\n group_size=100000,\n bin_size=200,\n num_processes=None,\n safe_mode=True,\n **kwargs,\n)\nBatch sampler class for efficient packing of variable-length sequences\nThis sampler packs sequences into fixed-capacity bins (batches) to maximize\nGPU memory utilization and training throughput by reducing padding.\nIt supports both parallel packing (using FFD algorithm) and\nsequential packing (preserving original sequence order).\n\n\n\n\n\nName\nDescription\n\n\n\n\nefficiency\nCalculate the packing efficiency (ratio of tokens used to total token slots).\n\n\ngather_efficiency\nGather and synchronize packing efficiency estimates across all distributed\n\n\ngather_len_batches\nGather and synchronize batch counts across all distributed ranks. Returns\n\n\ngenerate_batches\nGenerate packed batches for training.\n\n\nset_epoch\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.efficiency()\nCalculate the packing efficiency (ratio of tokens used to total token slots).\nHigher is better - 1.0 would mean perfect packing with no wasted space.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_efficiency()\nGather and synchronize packing efficiency estimates across all distributed\nranks.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nfloat\nA conservative efficiency estimate based on the measurements.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_len_batches(num)\nGather and synchronize batch counts across all distributed ranks. Returns\nthe minimum number of batches available on any rank.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.generate_batches(set_stats=False)\nGenerate packed batches for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nset_stats\nbool\nWhether to update efficiency statistics.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[list[int]]]\nList of batches, where each batch contains multiple bins, and each bin contains multiple sequence indices.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.set_epoch(epoch)\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nallocate_sequentially\nSequential allocator that preserves example order.\n\n\nffd_check\nFirst-fit-decreasing bin packing algorithm check.\n\n\npack_group\nPack a group of sequences into bins using First-Fit Decreasing algorithm.\n\n\npack_parallel\nPack sequences into bins using parallel processing.\n\n\n\n\n\nutils.samplers.multipack.allocate_sequentially(\n sequence_lengths,\n rank,\n bin_capacity,\n num_ranks,\n)\nSequential allocator that preserves example order.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nThe lengths of all examples.\nrequired\n\n\nrank\nint\nThe current rank (for distributed training).\nrequired\n\n\nbin_capacity\nint\nThe capacity of each bin (maximum sequence length).\nrequired\n\n\nnum_ranks\nint\nNumber of ranks (processes / GPUs).\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nrank_batches\nlist[list[int]]\nList of batches for the current rank.\n\n\ntotal_tokens_used\nint\nNumber of actual example tokens.\n\n\ntotal_token_slots\nint\nMaximum theoretical number of example tokens (number of bins * bin capacity).\n\n\n\n\n\n\n\nutils.samplers.multipack.ffd_check(sequence_lengths, bin_capacity, num_bins)\nFirst-fit-decreasing bin packing algorithm check.\nChecks if sequences with the given lengths could fit in the specified number of\nbins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin.\nrequired\n\n\nnum_bins\nint\nNumber of bins available.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nTrue if all sequences can be packed, False otherwise.\n\n\n\n\n\n\n\nutils.samplers.multipack.pack_group(\n sequence_lengths,\n group_offset,\n bin_capacity,\n max_bins,\n bin_size,\n safe_mode=True,\n)\nPack a group of sequences into bins using First-Fit Decreasing algorithm.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\ngroup_offset\nint\nOffset to apply to indices when returning results.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin.\nrequired\n\n\nmax_bins\nint\nMaximum number of bins to use.\nrequired\n\n\nbin_size\nint\nMaximum number of sequences per bin.\nrequired\n\n\nsafe_mode\nbool\nIf True, use a more conservative packing approach.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[int]]\nList of bins, where each bin contains indices of sequences assigned to it.\n\n\n\n\n\n\n\nutils.samplers.multipack.pack_parallel(\n sequence_lengths,\n bin_capacity,\n group_size,\n bin_size,\n num_processes=None,\n safe_mode=True,\n mp_start_method='spawn',\n)\nPack sequences into bins using parallel processing.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin as total number of tokens.\nrequired\n\n\ngroup_size\nint\nNumber of sequences to process in each group.\nrequired\n\n\nbin_size\nint\nMaximum number of bins to use.\nrequired\n\n\nnum_processes\nint | None\nNumber of parallel processes to use.\nNone\n\n\nsafe_mode\nbool\nIf True, use a more conservative packing approach.\nTrue\n\n\nmp_start_method\nstr | None\nMultiprocessing start method (‘fork’, ‘spawn’, ‘forkserver’). ‘spawn’ is often safer with Numba/PyTorch. Set to None to use system default.\n'spawn'\n\n\n\nReturns:\nList of bins, where each bin contains indices of sequences assigned to it." + "text": "utils.samplers.multipack\nMultipack Batch Sampler - An efficient batch sampler for packing variable-length sequences\ninto fixed-capacity batches to optimize memory usage and training throughput.\n\n\n\n\n\nName\nDescription\n\n\n\n\nMultipackBatchSampler\nBatch sampler class for efficient packing of variable-length sequences\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler(\n sampler,\n batch_size,\n batch_max_len,\n lengths,\n packing_efficiency_estimate=1.0,\n drop_last=False,\n num_count_samples=8,\n sequential=False,\n group_size=100000,\n bin_size=200,\n num_processes=None,\n safe_mode=True,\n **kwargs,\n)\nBatch sampler class for efficient packing of variable-length sequences\nThis sampler packs sequences into fixed-capacity bins (batches) to maximize\nGPU memory utilization and training throughput by reducing padding.\nIt supports both parallel packing (using FFD algorithm) and\nsequential packing (preserving original sequence order).\n\n\n\n\n\nName\nDescription\n\n\n\n\nefficiency\nCalculate the packing efficiency (ratio of tokens used to total token slots).\n\n\ngather_efficiency\nGather and synchronize packing efficiency estimates across all distributed\n\n\ngather_len_batches\nGather and synchronize batch counts across all distributed ranks. Returns\n\n\ngenerate_batches\nGenerate packed batches for training.\n\n\nset_epoch\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.efficiency()\nCalculate the packing efficiency (ratio of tokens used to total token slots).\nHigher is better - 1.0 would mean perfect packing with no wasted space.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_efficiency()\nGather and synchronize packing efficiency estimates across all distributed\nranks.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nfloat\nA conservative efficiency estimate based on the measurements.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_len_batches(num)\nGather and synchronize batch counts across all distributed ranks. Returns\nthe minimum number of batches available on any rank.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.generate_batches(set_stats=False)\nGenerate packed batches for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nset_stats\nbool\nWhether to update efficiency statistics.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[list[int]]]\nList of batches, where each batch contains multiple bins, and each bin contains multiple sequence indices.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.set_epoch(epoch)\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nallocate_sequentially\nSequential allocator that preserves example order.\n\n\nffd_check\nFirst-fit-decreasing bin packing algorithm check.\n\n\npack_group\nPack a group of sequences into bins using First-Fit Decreasing algorithm.\n\n\npack_parallel\nPack sequences into bins using parallel processing.\n\n\n\n\n\nutils.samplers.multipack.allocate_sequentially(\n sequence_lengths,\n rank,\n bin_capacity,\n num_ranks,\n)\nSequential allocator that preserves example order.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nThe lengths of all examples.\nrequired\n\n\nrank\nint\nThe current rank (for distributed training).\nrequired\n\n\nbin_capacity\nint\nThe capacity of each bin (maximum sequence length).\nrequired\n\n\nnum_ranks\nint\nNumber of ranks (processes / GPUs).\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nrank_batches\nlist[list[int]]\nList of batches for the current rank.\n\n\ntotal_tokens_used\nint\nNumber of actual example tokens.\n\n\ntotal_token_slots\nint\nMaximum theoretical number of example tokens (number of bins * bin capacity).\n\n\n\n\n\n\n\nutils.samplers.multipack.ffd_check(sequence_lengths, bin_capacity, num_bins)\nFirst-fit-decreasing bin packing algorithm check.\nChecks if sequences with the given lengths could fit in the specified number of\nbins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin.\nrequired\n\n\nnum_bins\nint\nNumber of bins available.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nTrue if all sequences can be packed, False otherwise.\n\n\n\n\n\n\n\nutils.samplers.multipack.pack_group(\n sequence_lengths,\n group_offset,\n bin_capacity,\n max_bins,\n bin_size,\n safe_mode=True,\n)\nPack a group of sequences into bins using First-Fit Decreasing algorithm.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\ngroup_offset\nint\nOffset to apply to indices when returning results.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin.\nrequired\n\n\nmax_bins\nint\nMaximum number of bins to use.\nrequired\n\n\nbin_size\nint\nMaximum number of sequences per bin.\nrequired\n\n\nsafe_mode\nbool\nIf True, use a more conservative packing approach.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[int]]\nList of bins, where each bin contains indices of sequences assigned to it.\n\n\n\n\n\n\n\nutils.samplers.multipack.pack_parallel(\n sequence_lengths,\n bin_capacity,\n group_size,\n bin_size,\n num_processes=None,\n safe_mode=True,\n mp_start_method='spawn',\n)\nPack sequences into bins using parallel processing.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin as total number of tokens.\nrequired\n\n\ngroup_size\nint\nNumber of sequences to process in each group.\nrequired\n\n\nbin_size\nint\nMaximum number of bins to use.\nrequired\n\n\nnum_processes\nint | None\nNumber of parallel processes to use.\nNone\n\n\nsafe_mode\nbool\nIf True, use a more conservative packing approach.\nTrue\n\n\nmp_start_method\nstr | None\nMultiprocessing start method (‘fork’, ‘spawn’, ‘forkserver’). ‘spawn’ is often safer with Numba/PyTorch. Set to None to use system default.\n'spawn'\n\n\n\nReturns:\nList of bins, where each bin contains indices of sequences assigned to it." }, { "objectID": "docs/api/utils.samplers.multipack.html#classes", "href": "docs/api/utils.samplers.multipack.html#classes", "title": "utils.samplers.multipack", "section": "", - "text": "Name\nDescription\n\n\n\n\nMultipackBatchSampler\nBatch sampler class for efficient packing of variable-length sequences\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler(\n sampler,\n batch_size,\n batch_max_len,\n lengths,\n packing_efficiency_estimate=1.0,\n drop_last=False,\n num_count_samples=16,\n sequential=False,\n group_size=100000,\n bin_size=200,\n num_processes=None,\n safe_mode=True,\n **kwargs,\n)\nBatch sampler class for efficient packing of variable-length sequences\nThis sampler packs sequences into fixed-capacity bins (batches) to maximize\nGPU memory utilization and training throughput by reducing padding.\nIt supports both parallel packing (using FFD algorithm) and\nsequential packing (preserving original sequence order).\n\n\n\n\n\nName\nDescription\n\n\n\n\nefficiency\nCalculate the packing efficiency (ratio of tokens used to total token slots).\n\n\ngather_efficiency\nGather and synchronize packing efficiency estimates across all distributed\n\n\ngather_len_batches\nGather and synchronize batch counts across all distributed ranks. Returns\n\n\ngenerate_batches\nGenerate packed batches for training.\n\n\nset_epoch\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.efficiency()\nCalculate the packing efficiency (ratio of tokens used to total token slots).\nHigher is better - 1.0 would mean perfect packing with no wasted space.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_efficiency()\nGather and synchronize packing efficiency estimates across all distributed\nranks.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nfloat\nA conservative efficiency estimate based on the measurements.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_len_batches(num)\nGather and synchronize batch counts across all distributed ranks. Returns\nthe minimum number of batches available on any rank.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.generate_batches(set_stats=False)\nGenerate packed batches for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nset_stats\nbool\nWhether to update efficiency statistics.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[list[int]]]\nList of batches, where each batch contains multiple bins, and each bin contains multiple sequence indices.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.set_epoch(epoch)\nSet the epoch number, used for reproducible shuffling across epochs" + "text": "Name\nDescription\n\n\n\n\nMultipackBatchSampler\nBatch sampler class for efficient packing of variable-length sequences\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler(\n sampler,\n batch_size,\n batch_max_len,\n lengths,\n packing_efficiency_estimate=1.0,\n drop_last=False,\n num_count_samples=8,\n sequential=False,\n group_size=100000,\n bin_size=200,\n num_processes=None,\n safe_mode=True,\n **kwargs,\n)\nBatch sampler class for efficient packing of variable-length sequences\nThis sampler packs sequences into fixed-capacity bins (batches) to maximize\nGPU memory utilization and training throughput by reducing padding.\nIt supports both parallel packing (using FFD algorithm) and\nsequential packing (preserving original sequence order).\n\n\n\n\n\nName\nDescription\n\n\n\n\nefficiency\nCalculate the packing efficiency (ratio of tokens used to total token slots).\n\n\ngather_efficiency\nGather and synchronize packing efficiency estimates across all distributed\n\n\ngather_len_batches\nGather and synchronize batch counts across all distributed ranks. Returns\n\n\ngenerate_batches\nGenerate packed batches for training.\n\n\nset_epoch\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.efficiency()\nCalculate the packing efficiency (ratio of tokens used to total token slots).\nHigher is better - 1.0 would mean perfect packing with no wasted space.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_efficiency()\nGather and synchronize packing efficiency estimates across all distributed\nranks.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nfloat\nA conservative efficiency estimate based on the measurements.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_len_batches(num)\nGather and synchronize batch counts across all distributed ranks. Returns\nthe minimum number of batches available on any rank.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.generate_batches(set_stats=False)\nGenerate packed batches for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nset_stats\nbool\nWhether to update efficiency statistics.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[list[int]]]\nList of batches, where each batch contains multiple bins, and each bin contains multiple sequence indices.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.set_epoch(epoch)\nSet the epoch number, used for reproducible shuffling across epochs" }, { "objectID": "docs/api/utils.samplers.multipack.html#functions", @@ -1833,11 +1833,21 @@ ] }, { - "objectID": "index.html", - "href": "index.html", + "objectID": "index.html#latest-updates", + "href": "index.html#latest-updates", "title": "Axolotl", - "section": "", - "text": "Axolotl is a tool designed to streamline post-training for various AI models.\nPost-training refers to any modifications or additional training performed on\npre-trained models - including full model fine-tuning, parameter-efficient tuning (like\nLoRA and QLoRA), supervised fine-tuning (SFT), instruction tuning, and alignment\ntechniques. With support for multiple model architectures and training configurations,\nAxolotl makes it easy to get started with these techniques.\nAxolotl is designed to work with YAML config files that contain everything you need to\npreprocess a dataset, train or fine-tune a model, run model inference or evaluation,\nand much more.\nFeatures:", + "section": "🎉 Latest Updates", + "text": "🎉 Latest Updates\n\n2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the docs to learn more!\n2025/04: Llama 4 support has been added in Axolotl. See examples to start training your own Llama 4 models with Axolotl’s linearized version!\n2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the blog and docs to learn how to scale your context length when fine-tuning.\n2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the docs to fine-tune your own!\n2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the docs to give it a try.\n2025/02: Axolotl has added GRPO support. Dive into our blog and GRPO example and have some fun!\n2025/01: Axolotl has added Reward Modelling / Process Reward Modelling fine-tuning support. See docs.", + "crumbs": [ + "Home" + ] + }, + { + "objectID": "index.html#overview", + "href": "index.html#overview", + "title": "Axolotl", + "section": "✨ Overview", + "text": "✨ Overview\nAxolotl is a tool designed to streamline post-training for various AI models.\nFeatures:\n\nMultiple Model Support: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more. We are compatible with HuggingFace transformers causal language models.\nTraining Methods: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), Multimodal, and Reward Modelling (RM) / Process Reward Modelling (PRM).\nEasy Configuration: Re-use a single YAML file between dataset preprocess, training, evaluation, quantization, and inference.\nPerformance Optimizations: Multipacking, Flash Attention, Xformers, Flex Attention, Liger Kernel, Cut Cross Entropy, Sequence Parallelism (SP), LoRA optimizations, Multi-GPU training (FSDP1, FSDP2, DeepSpeed), Multi-node training (Torchrun, Ray), and many more!\nFlexible Dataset Handling: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets.\nCloud Ready: We ship Docker images and also PyPI packages for use on cloud platforms and local hardware.", "crumbs": [ "Home" ] @@ -1852,22 +1862,12 @@ "Home" ] }, - { - "objectID": "index.html#key-features", - "href": "index.html#key-features", - "title": "Axolotl", - "section": "✨ Key Features", - "text": "✨ Key Features\n\nMultiple Model Support: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more\nTraining Methods: Full fine-tuning, LoRA, QLoRA, and more\nEasy Configuration: Simple YAML files to control your training setup\nPerformance Optimizations: Flash Attention, xformers, multi-GPU training\nFlexible Dataset Handling: Use various formats and custom datasets\nCloud Ready: Run on cloud platforms or local hardware", - "crumbs": [ - "Home" - ] - }, { "objectID": "index.html#documentation", "href": "index.html#documentation", "title": "Axolotl", "section": "📚 Documentation", - "text": "📚 Documentation\n\nInstallation Options - Detailed setup instructions for different environments\nConfiguration Guide - Full configuration options and examples\nDataset Guide - Supported formats and how to use them\nMulti-GPU Training\nMulti-Node Training\nMultipacking\nAPI Reference - Auto-generated code documentation\nFAQ - Frequently asked questions", + "text": "📚 Documentation\n\nInstallation Options - Detailed setup instructions for different environments\nConfiguration Guide - Full configuration options and examples\nDataset Loading - Loading datasets from various sources\nDataset Guide - Supported formats and how to use them\nMulti-GPU Training\nMulti-Node Training\nMultipacking\nAPI Reference - Auto-generated code documentation\nFAQ - Frequently asked questions", "crumbs": [ "Home" ] @@ -1892,16 +1892,6 @@ "Home" ] }, - { - "objectID": "index.html#supported-models", - "href": "index.html#supported-models", - "title": "Axolotl", - "section": "Supported Models", - "text": "Supported Models\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nfp16/fp32\nlora\nqlora\ngptq\ngptq w/flash attn\nflash attn\nxformers attn\n\n\n\n\nllama\n✅\n✅\n✅\n✅\n✅\n✅\n✅\n\n\nMistral\n✅\n✅\n✅\n✅\n✅\n✅\n✅\n\n\nMixtral-MoE\n✅\n✅\n✅\n❓\n❓\n❓\n❓\n\n\nMixtral8X22\n✅\n✅\n✅\n❓\n❓\n❓\n❓\n\n\nPythia\n✅\n✅\n✅\n❌\n❌\n❌\n❓\n\n\ncerebras\n✅\n✅\n✅\n❌\n❌\n❌\n❓\n\n\nbtlm\n✅\n✅\n✅\n❌\n❌\n❌\n❓\n\n\nmpt\n✅\n❌\n❓\n❌\n❌\n❌\n❓\n\n\nfalcon\n✅\n✅\n✅\n❌\n❌\n❌\n❓\n\n\ngpt-j\n✅\n✅\n✅\n❌\n❌\n❓\n❓\n\n\nXGen\n✅\n❓\n✅\n❓\n❓\n❓\n✅\n\n\nphi\n✅\n✅\n✅\n❓\n❓\n❓\n❓\n\n\nRWKV\n✅\n❓\n❓\n❓\n❓\n❓\n❓\n\n\nQwen\n✅\n✅\n✅\n❓\n❓\n❓\n❓\n\n\nGemma\n✅\n✅\n✅\n❓\n❓\n✅\n❓\n\n\nJamba\n✅\n✅\n✅\n❓\n❓\n✅\n❓\n\n\n\n✅: supported\n❌: not supported\n❓: untested", - "crumbs": [ - "Home" - ] - }, { "objectID": "index.html#sponsors", "href": "index.html#sponsors", diff --git a/sitemap.xml b/sitemap.xml index 62452cce5..83835e8ff 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -2,758 +2,758 @@ https://docs.axolotl.ai/docs/unsloth.html - 2025-06-11T21:11:16.399Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/dataset-formats/conversation.html - 2025-06-11T21:11:16.394Z + 2025-06-12T17:23:38.997Z https://docs.axolotl.ai/docs/dataset-formats/stepwise_supervised.html - 2025-06-11T21:11:16.394Z + 2025-06-12T17:23:38.997Z https://docs.axolotl.ai/docs/dataset-formats/tokenized.html - 2025-06-11T21:11:16.394Z + 2025-06-12T17:23:38.998Z https://docs.axolotl.ai/docs/mac.html - 2025-06-11T21:11:16.398Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/nccl.html - 2025-06-11T21:11:16.399Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/multi-node.html - 2025-06-11T21:11:16.399Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/docker.html - 2025-06-11T21:11:16.394Z + 2025-06-12T17:23:38.998Z https://docs.axolotl.ai/docs/lr_groups.html - 2025-06-11T21:11:16.398Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/inference.html - 2025-06-11T21:11:16.398Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/cli.html - 2025-06-11T21:11:16.393Z + 2025-06-12T17:23:38.997Z https://docs.axolotl.ai/docs/faq.html - 2025-06-11T21:11:16.395Z + 2025-06-12T17:23:38.998Z https://docs.axolotl.ai/docs/getting-started.html - 2025-06-11T21:11:16.395Z + 2025-06-12T17:23:38.998Z https://docs.axolotl.ai/docs/custom_integrations.html - 2025-06-11T21:11:16.393Z + 2025-06-12T17:23:38.997Z https://docs.axolotl.ai/docs/fsdp_qlora.html - 2025-06-11T21:11:16.395Z + 2025-06-12T17:23:38.998Z https://docs.axolotl.ai/docs/api/common.const.html - 2025-06-11T21:11:47.171Z + 2025-06-12T17:24:09.577Z https://docs.axolotl.ai/docs/api/prompt_tokenizers.html - 2025-06-11T21:11:45.864Z + 2025-06-12T17:24:08.286Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.user_defined.html - 2025-06-11T21:11:46.505Z + 2025-06-12T17:24:08.913Z https://docs.axolotl.ai/docs/api/core.training_args.html - 2025-06-11T21:11:45.994Z + 2025-06-12T17:24:08.405Z https://docs.axolotl.ai/docs/api/prompt_strategies.user_defined.html - 2025-06-11T21:11:46.429Z + 2025-06-12T17:24:08.838Z https://docs.axolotl.ai/docs/api/utils.dict.html - 2025-06-11T21:11:46.902Z + 2025-06-12T17:24:09.309Z https://docs.axolotl.ai/docs/api/monkeypatch.unsloth_.html - 2025-06-11T21:11:46.766Z + 2025-06-12T17:24:09.174Z https://docs.axolotl.ai/docs/api/utils.collators.mamba.html - 2025-06-11T21:11:47.211Z + 2025-06-12T17:24:09.616Z https://docs.axolotl.ai/docs/api/core.trainers.mixins.optimizer.html - 2025-06-11T21:11:46.335Z + 2025-06-12T17:24:08.745Z https://docs.axolotl.ai/docs/api/cli.train.html - 2025-06-11T21:11:46.074Z + 2025-06-12T17:24:08.485Z https://docs.axolotl.ai/docs/api/prompt_strategies.messages.chat.html - 2025-06-11T21:11:46.479Z + 2025-06-12T17:24:08.888Z https://docs.axolotl.ai/docs/api/core.chat.format.llama3x.html - 2025-06-11T21:11:46.020Z + 2025-06-12T17:24:08.431Z https://docs.axolotl.ai/docs/api/loaders.processor.html - 2025-06-11T21:11:46.315Z + 2025-06-12T17:24:08.725Z https://docs.axolotl.ai/docs/api/core.datasets.transforms.chat_builder.html - 2025-06-11T21:11:46.034Z + 2025-06-12T17:24:08.445Z https://docs.axolotl.ai/docs/api/core.trainers.mamba.html - 2025-06-11T21:11:46.260Z + 2025-06-12T17:24:08.670Z https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html - 2025-06-11T21:11:46.773Z + 2025-06-12T17:24:09.180Z https://docs.axolotl.ai/docs/api/models.mamba.modeling_mamba.html - 2025-06-11T21:11:47.187Z + 2025-06-12T17:24:09.592Z https://docs.axolotl.ai/docs/api/core.trainers.relora.html - 2025-06-11T21:11:46.264Z + 2025-06-12T17:24:08.674Z https://docs.axolotl.ai/docs/api/core.builders.causal.html - 2025-06-11T21:11:45.884Z + 2025-06-12T17:24:08.307Z https://docs.axolotl.ai/docs/api/core.chat.messages.html - 2025-06-11T21:11:46.017Z + 2025-06-12T17:24:08.428Z https://docs.axolotl.ai/docs/api/integrations.lm_eval.args.html - 2025-06-11T21:11:47.165Z + 2025-06-12T17:24:09.570Z https://docs.axolotl.ai/docs/api/cli.quantize.html - 2025-06-11T21:11:46.228Z + 2025-06-12T17:24:08.638Z https://docs.axolotl.ai/docs/api/cli.checks.html - 2025-06-11T21:11:46.108Z + 2025-06-12T17:24:08.519Z https://docs.axolotl.ai/docs/api/prompt_strategies.kto.llama3.html - 2025-06-11T21:11:46.514Z + 2025-06-12T17:24:08.923Z https://docs.axolotl.ai/docs/api/kernels.lora.html - 2025-06-11T21:11:46.632Z + 2025-06-12T17:24:09.040Z https://docs.axolotl.ai/docs/api/utils.schemas.multimodal.html - 2025-06-11T21:11:46.997Z + 2025-06-12T17:24:09.404Z https://docs.axolotl.ai/docs/api/loaders.adapter.html - 2025-06-11T21:11:46.320Z + 2025-06-12T17:24:08.730Z https://docs.axolotl.ai/docs/api/index.html - 2025-06-11T21:11:45.724Z + 2025-06-12T17:24:08.148Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_patch_multipack.html - 2025-06-11T21:11:46.749Z + 2025-06-12T17:24:09.157Z https://docs.axolotl.ai/docs/api/train.html - 2025-06-11T21:11:45.786Z + 2025-06-12T17:24:08.210Z https://docs.axolotl.ai/docs/api/monkeypatch.mixtral.html - 2025-06-11T21:11:46.769Z + 2025-06-12T17:24:09.177Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chatml.html - 2025-06-11T21:11:46.502Z + 2025-06-12T17:24:08.910Z https://docs.axolotl.ai/docs/api/integrations.grokfast.optimizer.html - 2025-06-11T21:11:47.150Z + 2025-06-12T17:24:09.556Z https://docs.axolotl.ai/docs/api/utils.samplers.multipack.html - 2025-06-11T21:11:47.256Z + 2025-06-12T17:24:09.661Z https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_chat.html - 2025-06-11T21:11:46.408Z + 2025-06-12T17:24:08.817Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_expand_mask.html - 2025-06-11T21:11:46.713Z + 2025-06-12T17:24:09.121Z https://docs.axolotl.ai/docs/api/common.architectures.html - 2025-06-11T21:11:47.169Z + 2025-06-12T17:24:09.575Z https://docs.axolotl.ai/docs/api/utils.schemas.utils.html - 2025-06-11T21:11:47.025Z + 2025-06-12T17:24:09.432Z https://docs.axolotl.ai/docs/api/utils.chat_templates.html - 2025-06-11T21:11:46.816Z + 2025-06-12T17:24:09.223Z https://docs.axolotl.ai/docs/api/utils.callbacks.comet_.html - 2025-06-11T21:11:47.274Z + 2025-06-12T17:24:09.680Z https://docs.axolotl.ai/docs/api/cli.main.html - 2025-06-11T21:11:46.066Z + 2025-06-12T17:24:08.476Z https://docs.axolotl.ai/docs/api/core.trainers.grpo.trainer.html - 2025-06-11T21:11:46.282Z + 2025-06-12T17:24:08.692Z https://docs.axolotl.ai/docs/api/utils.callbacks.mlflow_.html - 2025-06-11T21:11:47.271Z + 2025-06-12T17:24:09.676Z https://docs.axolotl.ai/docs/api/loaders.model.html - 2025-06-11T21:11:46.305Z + 2025-06-12T17:24:08.715Z https://docs.axolotl.ai/docs/api/utils.tokenization.html - 2025-06-11T21:11:46.806Z + 2025-06-12T17:24:09.213Z https://docs.axolotl.ai/docs/api/prompt_strategies.kto.chatml.html - 2025-06-11T21:11:46.522Z + 2025-06-12T17:24:08.931Z https://docs.axolotl.ai/docs/api/utils.model_shard_quant.html - 2025-06-11T21:11:46.826Z + 2025-06-12T17:24:09.234Z https://docs.axolotl.ai/docs/api/core.trainers.mixins.scheduler.html - 2025-06-11T21:11:46.346Z + 2025-06-12T17:24:08.755Z https://docs.axolotl.ai/docs/api/core.chat.format.chatml.html - 2025-06-11T21:11:46.019Z + 2025-06-12T17:24:08.429Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.passthrough.html - 2025-06-11T21:11:46.506Z + 2025-06-12T17:24:08.915Z https://docs.axolotl.ai/docs/api/prompt_strategies.orpo.chat_template.html - 2025-06-11T21:11:46.544Z + 2025-06-12T17:24:08.953Z https://docs.axolotl.ai/docs/api/monkeypatch.multipack.html - 2025-06-11T21:11:46.704Z + 2025-06-12T17:24:09.113Z https://docs.axolotl.ai/docs/api/prompt_strategies.base.html - 2025-06-11T21:11:46.370Z + 2025-06-12T17:24:08.780Z https://docs.axolotl.ai/docs/api/core.trainers.grpo.sampler.html - 2025-06-11T21:11:46.294Z + 2025-06-12T17:24:08.704Z https://docs.axolotl.ai/docs/api/utils.collators.batching.html - 2025-06-11T21:11:47.207Z + 2025-06-12T17:24:09.613Z https://docs.axolotl.ai/docs/api/monkeypatch.lora_kernels.html - 2025-06-11T21:11:46.738Z + 2025-06-12T17:24:09.146Z https://docs.axolotl.ai/docs/api/integrations.kd.trainer.html - 2025-06-11T21:11:47.158Z + 2025-06-12T17:24:09.564Z https://docs.axolotl.ai/docs/api/utils.schemas.enums.html - 2025-06-11T21:11:47.020Z + 2025-06-12T17:24:09.426Z https://docs.axolotl.ai/docs/api/datasets.html - 2025-06-11T21:11:45.808Z + 2025-06-12T17:24:08.232Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.zephyr.html - 2025-06-11T21:11:46.503Z + 2025-06-12T17:24:08.912Z https://docs.axolotl.ai/docs/api/monkeypatch.data.batch_dataset_fetcher.html - 2025-06-11T21:11:46.768Z + 2025-06-12T17:24:09.176Z https://docs.axolotl.ai/docs/api/utils.schemas.model.html - 2025-06-11T21:11:46.956Z + 2025-06-12T17:24:09.364Z https://docs.axolotl.ai/docs/api/integrations.cut_cross_entropy.args.html - 2025-06-11T21:11:47.149Z + 2025-06-12T17:24:09.555Z https://docs.axolotl.ai/docs/api/utils.trainer.html - 2025-06-11T21:11:46.854Z + 2025-06-12T17:24:09.262Z https://docs.axolotl.ai/docs/api/utils.callbacks.lisa.html - 2025-06-11T21:11:47.267Z + 2025-06-12T17:24:09.673Z https://docs.axolotl.ai/docs/api/utils.data.pretraining.html - 2025-06-11T21:11:46.911Z + 2025-06-12T17:24:09.318Z https://docs.axolotl.ai/docs/api/utils.callbacks.profiler.html - 2025-06-11T21:11:47.266Z + 2025-06-12T17:24:09.671Z https://docs.axolotl.ai/docs/api/prompt_strategies.metharme.html - 2025-06-11T21:11:46.465Z + 2025-06-12T17:24:08.874Z https://docs.axolotl.ai/docs/api/utils.collators.core.html - 2025-06-11T21:11:47.189Z + 2025-06-12T17:24:09.594Z https://docs.axolotl.ai/docs/api/monkeypatch.stablelm_attn_hijack_flash.html - 2025-06-11T21:11:46.755Z + 2025-06-12T17:24:09.163Z https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_w_system.html - 2025-06-11T21:11:46.421Z + 2025-06-12T17:24:08.830Z https://docs.axolotl.ai/docs/api/utils.lora.html - 2025-06-11T21:11:46.820Z + 2025-06-12T17:24:09.228Z https://docs.axolotl.ai/docs/qat.html - 2025-06-11T21:11:16.399Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/quantize.html - 2025-06-11T21:11:16.399Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/ray-integration.html - 2025-06-11T21:11:16.399Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/sequence_parallelism.html - 2025-06-11T21:11:16.399Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/reward_modelling.html - 2025-06-11T21:11:16.399Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/index.html - 2025-06-11T21:11:16.413Z + 2025-06-12T17:23:39.014Z https://docs.axolotl.ai/src/axolotl/integrations/LICENSE.html - 2025-06-11T21:11:16.416Z + 2025-06-12T17:23:39.018Z https://docs.axolotl.ai/FAQS.html - 2025-06-11T21:11:16.390Z + 2025-06-12T17:23:38.995Z https://docs.axolotl.ai/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html - 2025-06-11T21:11:16.417Z + 2025-06-12T17:23:39.018Z https://docs.axolotl.ai/TODO.html - 2025-06-11T21:11:16.391Z + 2025-06-12T17:23:38.996Z https://docs.axolotl.ai/examples/colab-notebooks/colab-axolotl-example.html - 2025-06-11T21:11:16.400Z + 2025-06-12T17:23:39.002Z https://docs.axolotl.ai/docs/torchao.html - 2025-06-11T21:11:16.399Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/config.html - 2025-06-11T21:11:16.393Z + 2025-06-12T17:23:38.997Z https://docs.axolotl.ai/docs/input_output.html - 2025-06-11T21:11:16.398Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/batch_vs_grad.html - 2025-06-11T21:11:16.393Z + 2025-06-12T17:23:38.997Z https://docs.axolotl.ai/docs/api/utils.quantization.html - 2025-06-11T21:11:46.938Z + 2025-06-12T17:24:09.346Z https://docs.axolotl.ai/docs/api/utils.bench.html - 2025-06-11T21:11:46.830Z + 2025-06-12T17:24:09.237Z https://docs.axolotl.ai/docs/api/loaders.tokenizer.html - 2025-06-11T21:11:46.313Z + 2025-06-12T17:24:08.723Z https://docs.axolotl.ai/docs/api/utils.freeze.html - 2025-06-11T21:11:46.837Z + 2025-06-12T17:24:09.245Z https://docs.axolotl.ai/docs/api/prompt_strategies.orcamini.html - 2025-06-11T21:11:46.469Z + 2025-06-12T17:24:08.878Z https://docs.axolotl.ai/docs/api/utils.schemas.training.html - 2025-06-11T21:11:46.961Z + 2025-06-12T17:24:09.369Z https://docs.axolotl.ai/docs/api/integrations.spectrum.args.html - 2025-06-11T21:11:47.168Z + 2025-06-12T17:24:09.574Z https://docs.axolotl.ai/docs/api/utils.ctx_managers.sequence_parallel.html - 2025-06-11T21:11:46.369Z + 2025-06-12T17:24:08.778Z https://docs.axolotl.ai/docs/api/cli.inference.html - 2025-06-11T21:11:46.140Z + 2025-06-12T17:24:08.551Z https://docs.axolotl.ai/docs/api/logging_config.html - 2025-06-11T21:11:45.873Z + 2025-06-12T17:24:08.296Z https://docs.axolotl.ai/docs/api/loaders.constants.html - 2025-06-11T21:11:46.330Z + 2025-06-12T17:24:08.740Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chat_template.html - 2025-06-11T21:11:46.481Z + 2025-06-12T17:24:08.890Z https://docs.axolotl.ai/docs/api/cli.args.html - 2025-06-11T21:11:46.102Z + 2025-06-12T17:24:08.512Z https://docs.axolotl.ai/docs/api/utils.schemas.trl.html - 2025-06-11T21:11:46.991Z + 2025-06-12T17:24:09.399Z https://docs.axolotl.ai/docs/api/prompt_strategies.pygmalion.html - 2025-06-11T21:11:46.475Z + 2025-06-12T17:24:08.884Z https://docs.axolotl.ai/docs/api/convert.html - 2025-06-11T21:11:45.821Z + 2025-06-12T17:24:08.245Z https://docs.axolotl.ai/docs/api/core.trainers.base.html - 2025-06-11T21:11:46.238Z + 2025-06-12T17:24:08.649Z https://docs.axolotl.ai/docs/api/cli.preprocess.html - 2025-06-11T21:11:46.169Z + 2025-06-12T17:24:08.579Z https://docs.axolotl.ai/docs/api/cli.config.html - 2025-06-11T21:11:46.126Z + 2025-06-12T17:24:08.537Z https://docs.axolotl.ai/docs/api/monkeypatch.relora.html - 2025-06-11T21:11:46.711Z + 2025-06-12T17:24:09.119Z https://docs.axolotl.ai/docs/api/core.chat.format.shared.html - 2025-06-11T21:11:46.022Z + 2025-06-12T17:24:08.432Z https://docs.axolotl.ai/docs/api/core.trainers.dpo.trainer.html - 2025-06-11T21:11:46.271Z + 2025-06-12T17:24:08.681Z https://docs.axolotl.ai/docs/api/utils.callbacks.qat.html - 2025-06-11T21:11:47.281Z + 2025-06-12T17:24:09.686Z https://docs.axolotl.ai/docs/api/utils.optimizers.adopt.html - 2025-06-11T21:11:46.909Z + 2025-06-12T17:24:09.317Z https://docs.axolotl.ai/docs/api/cli.evaluate.html - 2025-06-11T21:11:46.082Z + 2025-06-12T17:24:08.493Z https://docs.axolotl.ai/docs/api/core.trainers.trl.html - 2025-06-11T21:11:46.255Z + 2025-06-12T17:24:08.665Z https://docs.axolotl.ai/docs/api/core.builders.base.html - 2025-06-11T21:11:45.880Z + 2025-06-12T17:24:08.302Z https://docs.axolotl.ai/docs/api/monkeypatch.trainer_fsdp_optim.html - 2025-06-11T21:11:46.758Z + 2025-06-12T17:24:09.166Z https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_disk.html - 2025-06-11T21:11:46.799Z + 2025-06-12T17:24:09.206Z https://docs.axolotl.ai/docs/api/utils.distributed.html - 2025-06-11T21:11:46.898Z + 2025-06-12T17:24:09.306Z https://docs.axolotl.ai/docs/api/prompt_strategies.input_output.html - 2025-06-11T21:11:46.454Z + 2025-06-12T17:24:08.862Z https://docs.axolotl.ai/docs/api/utils.schemas.config.html - 2025-06-11T21:11:46.949Z + 2025-06-12T17:24:09.357Z https://docs.axolotl.ai/docs/api/cli.utils.html - 2025-06-11T21:11:46.207Z + 2025-06-12T17:24:08.617Z https://docs.axolotl.ai/docs/api/utils.callbacks.perplexity.html - 2025-06-11T21:11:47.262Z + 2025-06-12T17:24:09.667Z https://docs.axolotl.ai/docs/api/utils.schemas.integrations.html - 2025-06-11T21:11:47.009Z + 2025-06-12T17:24:09.416Z https://docs.axolotl.ai/docs/api/loaders.patch_manager.html - 2025-06-11T21:11:46.328Z + 2025-06-12T17:24:08.738Z https://docs.axolotl.ai/docs/api/monkeypatch.utils.html - 2025-06-11T21:11:46.746Z + 2025-06-12T17:24:09.154Z https://docs.axolotl.ai/docs/api/cli.vllm_serve.html - 2025-06-11T21:11:46.214Z + 2025-06-12T17:24:08.624Z https://docs.axolotl.ai/docs/api/monkeypatch.mistral_attn_hijack_flash.html - 2025-06-11T21:11:46.703Z + 2025-06-12T17:24:09.111Z https://docs.axolotl.ai/docs/api/integrations.liger.args.html - 2025-06-11T21:11:47.161Z + 2025-06-12T17:24:09.567Z https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_instruct.html - 2025-06-11T21:11:46.410Z + 2025-06-12T17:24:08.818Z https://docs.axolotl.ai/docs/api/utils.data.sft.html - 2025-06-11T21:11:46.918Z + 2025-06-12T17:24:09.325Z https://docs.axolotl.ai/docs/api/utils.collators.mm_chat.html - 2025-06-11T21:11:47.216Z + 2025-06-12T17:24:09.621Z https://docs.axolotl.ai/docs/api/prompt_strategies.llama2_chat.html - 2025-06-11T21:11:46.442Z + 2025-06-12T17:24:08.850Z https://docs.axolotl.ai/docs/api/monkeypatch.transformers_fa_utils.html - 2025-06-11T21:11:46.765Z + 2025-06-12T17:24:09.173Z https://docs.axolotl.ai/docs/api/cli.merge_sharded_fsdp_weights.html - 2025-06-11T21:11:46.161Z + 2025-06-12T17:24:08.571Z https://docs.axolotl.ai/docs/api/cli.merge_lora.html - 2025-06-11T21:11:46.148Z + 2025-06-12T17:24:08.559Z https://docs.axolotl.ai/docs/api/integrations.base.html - 2025-06-11T21:11:47.146Z + 2025-06-12T17:24:09.551Z https://docs.axolotl.ai/docs/api/core.trainers.mixins.rng_state_loader.html - 2025-06-11T21:11:46.339Z + 2025-06-12T17:24:08.749Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_flash.html - 2025-06-11T21:11:46.687Z + 2025-06-12T17:24:09.095Z https://docs.axolotl.ai/docs/api/kernels.quantize.html - 2025-06-11T21:11:46.660Z + 2025-06-12T17:24:09.068Z https://docs.axolotl.ai/docs/api/evaluate.html - 2025-06-11T21:11:45.797Z + 2025-06-12T17:24:08.221Z https://docs.axolotl.ai/docs/api/core.builders.rl.html - 2025-06-11T21:11:45.892Z + 2025-06-12T17:24:08.315Z https://docs.axolotl.ai/docs/api/utils.schemas.datasets.html - 2025-06-11T21:11:46.979Z + 2025-06-12T17:24:09.387Z https://docs.axolotl.ai/docs/api/common.datasets.html - 2025-06-11T21:11:47.186Z + 2025-06-12T17:24:09.591Z https://docs.axolotl.ai/docs/api/kernels.utils.html - 2025-06-11T21:11:46.662Z + 2025-06-12T17:24:09.069Z https://docs.axolotl.ai/docs/api/prompt_strategies.completion.html - 2025-06-11T21:11:46.448Z + 2025-06-12T17:24:08.856Z https://docs.axolotl.ai/docs/api/prompt_strategies.bradley_terry.llama3.html - 2025-06-11T21:11:46.548Z + 2025-06-12T17:24:08.957Z https://docs.axolotl.ai/docs/api/prompt_strategies.stepwise_supervised.html - 2025-06-11T21:11:46.458Z + 2025-06-12T17:24:08.867Z https://docs.axolotl.ai/docs/api/kernels.swiglu.html - 2025-06-11T21:11:46.652Z + 2025-06-12T17:24:09.060Z https://docs.axolotl.ai/docs/api/cli.cloud.base.html - 2025-06-11T21:11:46.217Z + 2025-06-12T17:24:08.627Z https://docs.axolotl.ai/docs/api/prompt_strategies.kto.user_defined.html - 2025-06-11T21:11:46.524Z + 2025-06-12T17:24:08.933Z https://docs.axolotl.ai/docs/api/prompt_strategies.chat_template.html - 2025-06-11T21:11:46.394Z + 2025-06-12T17:24:08.803Z https://docs.axolotl.ai/docs/api/monkeypatch.btlm_attn_hijack_flash.html - 2025-06-11T21:11:46.748Z + 2025-06-12T17:24:09.156Z https://docs.axolotl.ai/docs/api/utils.schemas.peft.html - 2025-06-11T21:11:46.988Z + 2025-06-12T17:24:09.395Z https://docs.axolotl.ai/docs/api/core.datasets.chat.html - 2025-06-11T21:11:46.027Z + 2025-06-12T17:24:08.437Z https://docs.axolotl.ai/docs/api/core.trainers.utils.html - 2025-06-11T21:11:46.295Z + 2025-06-12T17:24:08.705Z https://docs.axolotl.ai/docs/api/kernels.geglu.html - 2025-06-11T21:11:46.642Z + 2025-06-12T17:24:09.050Z https://docs.axolotl.ai/docs/api/cli.cloud.modal_.html - 2025-06-11T21:11:46.223Z + 2025-06-12T17:24:08.634Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_xformers.html - 2025-06-11T21:11:46.689Z + 2025-06-12T17:24:09.097Z https://docs.axolotl.ai/docs/api/utils.schedulers.html - 2025-06-11T21:11:46.879Z + 2025-06-12T17:24:09.286Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.llama3.html - 2025-06-11T21:11:46.491Z + 2025-06-12T17:24:08.900Z https://docs.axolotl.ai/docs/api/cli.sweeps.html - 2025-06-11T21:11:46.175Z + 2025-06-12T17:24:08.585Z https://docs.axolotl.ai/docs/multimodal.html - 2025-06-11T21:11:16.399Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/debugging.html - 2025-06-11T21:11:16.394Z + 2025-06-12T17:23:38.998Z https://docs.axolotl.ai/docs/multi-gpu.html - 2025-06-11T21:11:16.398Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/lora_optims.html - 2025-06-11T21:11:16.398Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/rlhf.html - 2025-06-11T21:11:16.399Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/amd_hpc.html - 2025-06-11T21:11:16.393Z + 2025-06-12T17:23:38.997Z https://docs.axolotl.ai/docs/installation.html - 2025-06-11T21:11:16.398Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/multipack.html - 2025-06-11T21:11:16.399Z + 2025-06-12T17:23:39.001Z https://docs.axolotl.ai/docs/dataset_preprocessing.html - 2025-06-11T21:11:16.394Z + 2025-06-12T17:23:38.998Z https://docs.axolotl.ai/docs/dataset_loading.html - 2025-06-11T21:11:16.394Z + 2025-06-12T17:23:38.998Z https://docs.axolotl.ai/docs/dataset-formats/inst_tune.html - 2025-06-11T21:11:16.394Z + 2025-06-12T17:23:38.997Z https://docs.axolotl.ai/docs/dataset-formats/template_free.html - 2025-06-11T21:11:16.394Z + 2025-06-12T17:23:38.998Z https://docs.axolotl.ai/docs/dataset-formats/index.html - 2025-06-11T21:11:16.394Z + 2025-06-12T17:23:38.997Z https://docs.axolotl.ai/docs/dataset-formats/pretraining.html - 2025-06-11T21:11:16.394Z + 2025-06-12T17:23:38.997Z