diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 01606f902..7ff712757 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -29,12 +29,12 @@ jobs:
       - cuda: 126
         cuda_version: 12.6.3
         python_version: "3.11"
-        pytorch: 2.7.0
+        pytorch: 2.7.1
         axolotl_extras:
       - cuda: 128
         cuda_version: 12.8.1
         python_version: "3.11"
-        pytorch: 2.7.0
+        pytorch: 2.7.1
         axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
@@ -97,12 +97,12 @@ jobs:
       - cuda: 126
         cuda_version: 12.6.3
         python_version: "3.11"
-        pytorch: 2.7.0
+        pytorch: 2.7.1
         axolotl_extras:
       - cuda: 128
         cuda_version: 12.8.1
         python_version: "3.11"
-        pytorch: 2.7.0
+        pytorch: 2.7.1
         axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
diff --git a/.nojekyll b/.nojekyll
index 16a82cc64..81c8edb89 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-91cd29d0
\ No newline at end of file
+72083c85
\ No newline at end of file
diff --git a/docs/api/core.training_args.html b/docs/api/core.training_args.html
index a74183f9e..f35ea760a 100644
--- a/docs/api/core.training_args.html
+++ b/docs/api/core.training_args.html
@@ -547,43 +547,44 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
 sample_packing_bin_size=200,
 sample_packing_group_size=100000,
 max_seq_length=2048,
-relora_steps=None,
-relora_warmup_steps=None,
-relora_anneal_steps=None,
-relora_prune_ratio=0.9,
-bench_split='eval',
-bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',
-do_bench_eval=False,
-do_causal_lm_eval=False,
-max_bench_samples=None,
-bench_source_max_len=2048,
-dataloader_prefetch_factor=None,
-cosine_min_lr_ratio=None,
-cosine_constant_lr_ratio=None,
-loraplus_lr_ratio=None,
-loraplus_lr_embedding=1e-06,
-embedding_lr_scale=None,
-lr_groups=None,
-embedding_lr=None,
-qlora=False,
-orpo_alpha=None,
-lisa_n_layers=None,
-lisa_step_interval=None,
-lisa_layers_attribute=None,
-curriculum_sampling=None,
-alternate_lr_scheduler_type=None,
-chat_template=None,
-kd_ce_alpha=None,
-kd_alpha=1.0,
-kd_temperature=1.0,
-kd_zscore_base_temp=None,
-kd_top_k_before_softmax=None,
-adam_beta3=None,
-adam_epsilon2=None,
-image_size=None,
-image_resize_algorithm=None,
-simpo_gamma=None,
-)
+dataset_num_proc=None,
+relora_steps=None,
+relora_warmup_steps=None,
+relora_anneal_steps=None,
+relora_prune_ratio=0.9,
+bench_split='eval',
+bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',
+do_bench_eval=False,
+do_causal_lm_eval=False,
+max_bench_samples=None,
+bench_source_max_len=2048,
+dataloader_prefetch_factor=None,
+cosine_min_lr_ratio=None,
+cosine_constant_lr_ratio=None,
+loraplus_lr_ratio=None,
+loraplus_lr_embedding=1e-06,
+embedding_lr_scale=None,
+lr_groups=None,
+embedding_lr=None,
+qlora=False,
+orpo_alpha=None,
+lisa_n_layers=None,
+lisa_step_interval=None,
+lisa_layers_attribute=None,
+curriculum_sampling=None,
+alternate_lr_scheduler_type=None,
+chat_template=None,
+kd_ce_alpha=None,
+kd_alpha=1.0,
+kd_temperature=1.0,
+kd_zscore_base_temp=None,
+kd_top_k_before_softmax=None,
+adam_beta3=None,
+adam_epsilon2=None,
+image_size=None,
+image_resize_algorithm=None,
+simpo_gamma=None,
+)
CPO config for CPO training
KTO config for KTO training
ORPO config for ORPO training
PRM config for PRM training
Reward config for Reward training
Training arguments for Causal trainer
This code is duplicated due to HF TrainingArguments not setting output_dir with a default value so it can’t be used as a mixin.
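For context, here is a minimal sketch (not Axolotl's actual code) of the dataclass constraint this note describes. The class names are hypothetical stand-ins; the point is that dataclass fields are collected base-first, so a required field like `output_dir` lands after the mixin's defaulted fields in the generated `__init__`, which Python rejects:

```python
from dataclasses import dataclass


@dataclass
class TrainingArgsLike:
    # Stand-in for HF TrainingArguments: output_dir has no default.
    output_dir: str


@dataclass
class AxolotlMixinLike:
    # Stand-in for AxolotlTrainingMixins: every field is defaulted.
    max_seq_length: int = 2048
    sample_packing: bool = False


# Dataclass fields are collected in reverse MRO order, so the mixin's
# defaulted fields precede the required output_dir; dataclasses refuse
# to generate such an __init__, hence the duplicated field definitions.
try:

    @dataclass
    class Combined(TrainingArgsLike, AxolotlMixinLike):
        pass

except TypeError as err:
    print(err)  # non-default argument 'output_dir' follows default argument
```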
@@ -862,42 +868,43 @@ default value so it can’t be used as a mixin.
 sample_packing_bin_size=200,
 sample_packing_group_size=100000,
 max_seq_length=2048,
-relora_steps=None,
-relora_warmup_steps=None,
-relora_anneal_steps=None,
-relora_prune_ratio=0.9,
-bench_split='eval',
-bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',
-do_bench_eval=False,
-do_causal_lm_eval=False,
-max_bench_samples=None,
-bench_source_max_len=2048,
-dataloader_prefetch_factor=None,
-cosine_min_lr_ratio=None,
-cosine_constant_lr_ratio=None,
-loraplus_lr_ratio=None,
-loraplus_lr_embedding=1e-06,
-embedding_lr_scale=None,
-lr_groups=None,
-embedding_lr=None,
-qlora=False,
-orpo_alpha=None,
-lisa_n_layers=None,
-lisa_step_interval=None,
-lisa_layers_attribute=None,
-curriculum_sampling=None,
-alternate_lr_scheduler_type=None,
-chat_template=None,
-kd_ce_alpha=None,
-kd_alpha=1.0,
-kd_temperature=1.0,
-kd_zscore_base_temp=None,
-kd_top_k_before_softmax=None,
-adam_beta3=None,
-adam_epsilon2=None,
-image_size=None,
-image_resize_algorithm=None,
-)
+dataset_num_proc=None,
+relora_steps=None,
+relora_warmup_steps=None,
+relora_anneal_steps=None,
+relora_prune_ratio=0.9,
+bench_split='eval',
+bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',
+do_bench_eval=False,
+do_causal_lm_eval=False,
+max_bench_samples=None,
+bench_source_max_len=2048,
+dataloader_prefetch_factor=None,
+cosine_min_lr_ratio=None,
+cosine_constant_lr_ratio=None,
+loraplus_lr_ratio=None,
+loraplus_lr_embedding=1e-06,
+embedding_lr_scale=None,
+lr_groups=None,
+embedding_lr=None,
+qlora=False,
+orpo_alpha=None,
+lisa_n_layers=None,
+lisa_step_interval=None,
+lisa_layers_attribute=None,
+curriculum_sampling=None,
+alternate_lr_scheduler_type=None,
+chat_template=None,
+kd_ce_alpha=None,
+kd_alpha=1.0,
+kd_temperature=1.0,
+kd_zscore_base_temp=None,
+kd_top_k_before_softmax=None,
+adam_beta3=None,
+adam_epsilon2=None,
+image_size=None,
+image_resize_algorithm=None,
+)
Mixin class for the Axolotl training args.
diff --git a/docs/api/utils.samplers.multipack.html b/docs/api/utils.samplers.multipack.html
index 6d351ba0f..75a23093a 100644
--- a/docs/api/utils.samplers.multipack.html
+++ b/docs/api/utils.samplers.multipack.html
@@ -520,7 +520,7 @@ into fixed-capacity batches to optimize memory usage and training throughput.
 lengths,
 packing_efficiency_estimate=1.0,
 drop_last=False,
-num_count_samples=16,
+num_count_samples=8,
 sequential=False,
 group_size=100000,
 bin_size=200,
diff --git a/docs/qat.html b/docs/qat.html
index d94a67ed7..fff59d03e 100644
--- a/docs/qat.html
+++ b/docs/qat.html
@@ -512,7 +512,7 @@ and the QAT documentation in the
 weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8"
 group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
 fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
-Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the quantize command to do this.
+Once you have finished training, you must quantize your model using the same quantization configuration you trained with. You can use the quantize command to do this.
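To make `weight_dtype` and `group_size` concrete, here is an illustrative round trip of per-group symmetric int8 fake quantization. This is a sketch of the idea only, not Axolotl's actual QAT kernels, and the helper name is made up:

```python
import torch


def fake_quant_int8_per_group(w: torch.Tensor, group_size: int = 32) -> torch.Tensor:
    """Round weights to an int8 grid per group, then dequantize.

    Training sees the quantization error while staying in floating point,
    which is the essence of "fake" quantization.
    """
    groups = w.reshape(-1, group_size)                      # one scale per group
    scale = groups.abs().amax(dim=1, keepdim=True) / 127.0  # symmetric int8 range
    scale = scale.clamp(min=1e-8)                           # avoid division by zero
    q = torch.clamp(torch.round(groups / scale), -128, 127)
    return (q * scale).reshape(w.shape)


w = torch.randn(4, 64)
w_fq = fake_quant_int8_per_group(w, group_size=32)
print((w - w_fq).abs().max())  # small, nonzero quantization error
```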
-Axolotl is a tool designed to streamline post-training for various AI models.
-Post-training refers to any modifications or additional training performed on
-pre-trained models - including full model fine-tuning, parameter-efficient tuning (like
-LoRA and QLoRA), supervised fine-tuning (SFT), instruction tuning, and alignment
-techniques. With support for multiple model architectures and training configurations,
-Axolotl makes it easy to get started with these techniques.
-Axolotl is designed to work with YAML config files that contain everything you need to
-preprocess a dataset, train or fine-tune a model, run model inference or evaluation,
-and much more.
+Axolotl is a tool designed to streamline post-training for various AI models.
Features:
Requirements:
@@ -562,22 +566,12 @@
That’s it! Check out our Getting Started Guide for a more detailed walkthrough.
Contributions are welcome! Please see our Contributing Guide for details.
-|             | fp16/fp32 | lora | qlora | gptq | gptq w/flash attn | flash attn | xformers attn |
-|-------------|-----------|------|-------|------|-------------------|------------|---------------|
-| llama       | ✅        | ✅   | ✅    | ✅   | ✅                | ✅         | ✅            |
-| Mistral     | ✅        | ✅   | ✅    | ✅   | ✅                | ✅         | ✅            |
-| Mixtral-MoE | ✅        | ✅   | ✅    | ❓   | ❓                | ❓         | ❓            |
-| Mixtral8X22 | ✅        | ✅   | ✅    | ❓   | ❓                | ❓         | ❓            |
-| Pythia      | ✅        | ✅   | ✅    | ❌   | ❌                | ❌         | ❓            |
-| cerebras    | ✅        | ✅   | ✅    | ❌   | ❌                | ❌         | ❓            |
-| btlm        | ✅        | ✅   | ✅    | ❌   | ❌                | ❌         | ❓            |
-| mpt         | ✅        | ❌   | ❓    | ❌   | ❌                | ❌         | ❓            |
-| falcon      | ✅        | ✅   | ✅    | ❌   | ❌                | ❌         | ❓            |
-| gpt-j       | ✅        | ✅   | ✅    | ❌   | ❌                | ❓         | ❓            |
-| XGen        | ✅        | ❓   | ✅    | ❓   | ❓                | ❓         | ✅            |
-| phi         | ✅        | ✅   | ✅    | ❓   | ❓                | ❓         | ❓            |
-| RWKV        | ✅        | ❓   | ❓    | ❓   | ❓                | ❓         | ❓            |
-| Qwen        | ✅        | ✅   | ✅    | ❓   | ❓                | ❓         | ❓            |
-| Gemma       | ✅        | ✅   | ✅    | ❓   | ❓                | ✅         | ❓            |
-| Jamba       | ✅        | ✅   | ✅    | ❓   | ❓                | ✅         | ❓            |
-
-✅: supported
-❌: not supported
-❓: untested
-Thank you to our sponsors who help make Axolotl possible:
diff --git a/search.json b/search.json index 41e427623..53e98255f 100644 --- a/search.json +++ b/search.json @@ -644,14 +644,14 @@ "href": "docs/api/core.training_args.html", "title": "core.training_args", "section": "", - "text": "core.training_args\nextra axolotl specific training args\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlCPOConfig\nCPO config for CPO training\n\n\nAxolotlKTOConfig\nKTO config for KTO training\n\n\nAxolotlORPOConfig\nORPO config for ORPO training\n\n\nAxolotlPRMConfig\nPRM config for PRM training\n\n\nAxolotlRewardConfig\nReward config for Reward training\n\n\nAxolotlTrainingArguments\nTraining arguments for Causal trainer\n\n\nAxolotlTrainingMixins\nMixin class for the Axolotl training args.\n\n\n\n\n\ncore.training_args.AxolotlCPOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n simpo_gamma=None,\n)\nCPO config for CPO training\n\n\n\ncore.training_args.AxolotlKTOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nKTO config for KTO training\n\n\n\ncore.training_args.AxolotlORPOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n 
multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nORPO config for ORPO training\n\n\n\ncore.training_args.AxolotlPRMConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nPRM config for PRM training\n\n\n\ncore.training_args.AxolotlRewardConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n 
kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nReward config for Reward training\n\n\n\ncore.training_args.AxolotlTrainingArguments(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nTraining arguments for Causal trainer\nThis code is duplicated due to HF TrainingArguments not setting output_dir with a\ndefault value so it can’t be used as a mixin.\n\n\n\ncore.training_args.AxolotlTrainingMixins(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nMixin class for the Axolotl training args." 
+ "text": "core.training_args\nextra axolotl specific training args\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlCPOConfig\nCPO config for CPO training\n\n\nAxolotlKTOConfig\nKTO config for KTO training\n\n\nAxolotlORPOConfig\nORPO config for ORPO training\n\n\nAxolotlPRMConfig\nPRM config for PRM training\n\n\nAxolotlRewardConfig\nReward config for Reward training\n\n\nAxolotlTrainingArguments\nTraining arguments for Causal trainer\n\n\nAxolotlTrainingMixins\nMixin class for the Axolotl training args.\n\n\n\n\n\ncore.training_args.AxolotlCPOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n simpo_gamma=None,\n)\nCPO config for CPO training\n\n\n\ncore.training_args.AxolotlKTOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nKTO config for KTO training\n\n\n\ncore.training_args.AxolotlORPOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n 
dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nORPO config for ORPO training\n\n\n\ncore.training_args.AxolotlPRMConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nPRM config for PRM training\n\n\n\ncore.training_args.AxolotlRewardConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n 
image_size=None,\n image_resize_algorithm=None,\n)\nReward config for Reward training\n\n\n\ncore.training_args.AxolotlTrainingArguments(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nTraining arguments for Causal trainer\nThis code is duplicated due to HF TrainingArguments not setting output_dir with a\ndefault value so it can’t be used as a mixin.\n\n\n\ncore.training_args.AxolotlTrainingMixins(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nMixin class for the Axolotl training args." 
}, { "objectID": "docs/api/core.training_args.html#classes", "href": "docs/api/core.training_args.html#classes", "title": "core.training_args", "section": "", - "text": "Name\nDescription\n\n\n\n\nAxolotlCPOConfig\nCPO config for CPO training\n\n\nAxolotlKTOConfig\nKTO config for KTO training\n\n\nAxolotlORPOConfig\nORPO config for ORPO training\n\n\nAxolotlPRMConfig\nPRM config for PRM training\n\n\nAxolotlRewardConfig\nReward config for Reward training\n\n\nAxolotlTrainingArguments\nTraining arguments for Causal trainer\n\n\nAxolotlTrainingMixins\nMixin class for the Axolotl training args.\n\n\n\n\n\ncore.training_args.AxolotlCPOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n simpo_gamma=None,\n)\nCPO config for CPO training\n\n\n\ncore.training_args.AxolotlKTOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nKTO config for KTO training\n\n\n\ncore.training_args.AxolotlORPOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n 
sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nORPO config for ORPO training\n\n\n\ncore.training_args.AxolotlPRMConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nPRM config for PRM training\n\n\n\ncore.training_args.AxolotlRewardConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n 
image_resize_algorithm=None,\n)\nReward config for Reward training\n\n\n\ncore.training_args.AxolotlTrainingArguments(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nTraining arguments for Causal trainer\nThis code is duplicated due to HF TrainingArguments not setting output_dir with a\ndefault value so it can’t be used as a mixin.\n\n\n\ncore.training_args.AxolotlTrainingMixins(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nMixin class for the Axolotl training args." 
+ "text": "Name\nDescription\n\n\n\n\nAxolotlCPOConfig\nCPO config for CPO training\n\n\nAxolotlKTOConfig\nKTO config for KTO training\n\n\nAxolotlORPOConfig\nORPO config for ORPO training\n\n\nAxolotlPRMConfig\nPRM config for PRM training\n\n\nAxolotlRewardConfig\nReward config for Reward training\n\n\nAxolotlTrainingArguments\nTraining arguments for Causal trainer\n\n\nAxolotlTrainingMixins\nMixin class for the Axolotl training args.\n\n\n\n\n\ncore.training_args.AxolotlCPOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n simpo_gamma=None,\n)\nCPO config for CPO training\n\n\n\ncore.training_args.AxolotlKTOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nKTO config for KTO training\n\n\n\ncore.training_args.AxolotlORPOConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n 
relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nORPO config for ORPO training\n\n\n\ncore.training_args.AxolotlPRMConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nPRM config for PRM training\n\n\n\ncore.training_args.AxolotlRewardConfig(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nReward config for Reward 
training\n\n\n\ncore.training_args.AxolotlTrainingArguments(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nTraining arguments for Causal trainer\nThis code is duplicated due to HF TrainingArguments not setting output_dir with a\ndefault value so it can’t be used as a mixin.\n\n\n\ncore.training_args.AxolotlTrainingMixins(\n model_type=None,\n lr_quadratic_warmup=False,\n pretraining=False,\n sample_packing=False,\n sample_packing_sequentially=False,\n multipack_real_batches=False,\n eval_sample_packing=None,\n sample_packing_efficiency=1.0,\n sample_packing_bin_size=200,\n sample_packing_group_size=100000,\n max_seq_length=2048,\n dataset_num_proc=None,\n relora_steps=None,\n relora_warmup_steps=None,\n relora_anneal_steps=None,\n relora_prune_ratio=0.9,\n bench_split='eval',\n bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n do_bench_eval=False,\n do_causal_lm_eval=False,\n max_bench_samples=None,\n bench_source_max_len=2048,\n dataloader_prefetch_factor=None,\n cosine_min_lr_ratio=None,\n cosine_constant_lr_ratio=None,\n loraplus_lr_ratio=None,\n loraplus_lr_embedding=1e-06,\n embedding_lr_scale=None,\n lr_groups=None,\n embedding_lr=None,\n qlora=False,\n orpo_alpha=None,\n lisa_n_layers=None,\n lisa_step_interval=None,\n lisa_layers_attribute=None,\n curriculum_sampling=None,\n alternate_lr_scheduler_type=None,\n chat_template=None,\n kd_ce_alpha=None,\n kd_alpha=1.0,\n kd_temperature=1.0,\n kd_zscore_base_temp=None,\n kd_top_k_before_softmax=None,\n adam_beta3=None,\n adam_epsilon2=None,\n image_size=None,\n image_resize_algorithm=None,\n)\nMixin class for the Axolotl training args." 
}, { "objectID": "docs/api/prompt_strategies.user_defined.html", @@ -1127,14 +1127,14 @@ "href": "docs/api/utils.samplers.multipack.html", "title": "utils.samplers.multipack", "section": "", - "text": "utils.samplers.multipack\nMultipack Batch Sampler - An efficient batch sampler for packing variable-length sequences\ninto fixed-capacity batches to optimize memory usage and training throughput.\n\n\n\n\n\nName\nDescription\n\n\n\n\nMultipackBatchSampler\nBatch sampler class for efficient packing of variable-length sequences\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler(\n sampler,\n batch_size,\n batch_max_len,\n lengths,\n packing_efficiency_estimate=1.0,\n drop_last=False,\n num_count_samples=16,\n sequential=False,\n group_size=100000,\n bin_size=200,\n num_processes=None,\n safe_mode=True,\n **kwargs,\n)\nBatch sampler class for efficient packing of variable-length sequences\nThis sampler packs sequences into fixed-capacity bins (batches) to maximize\nGPU memory utilization and training throughput by reducing padding.\nIt supports both parallel packing (using FFD algorithm) and\nsequential packing (preserving original sequence order).\n\n\n\n\n\nName\nDescription\n\n\n\n\nefficiency\nCalculate the packing efficiency (ratio of tokens used to total token slots).\n\n\ngather_efficiency\nGather and synchronize packing efficiency estimates across all distributed\n\n\ngather_len_batches\nGather and synchronize batch counts across all distributed ranks. Returns\n\n\ngenerate_batches\nGenerate packed batches for training.\n\n\nset_epoch\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.efficiency()\nCalculate the packing efficiency (ratio of tokens used to total token slots).\nHigher is better - 1.0 would mean perfect packing with no wasted space.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_efficiency()\nGather and synchronize packing efficiency estimates across all distributed\nranks.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nfloat\nA conservative efficiency estimate based on the measurements.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_len_batches(num)\nGather and synchronize batch counts across all distributed ranks. 
Returns\nthe minimum number of batches available on any rank.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.generate_batches(set_stats=False)\nGenerate packed batches for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nset_stats\nbool\nWhether to update efficiency statistics.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[list[int]]]\nList of batches, where each batch contains multiple bins, and each bin contains multiple sequence indices.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.set_epoch(epoch)\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nallocate_sequentially\nSequential allocator that preserves example order.\n\n\nffd_check\nFirst-fit-decreasing bin packing algorithm check.\n\n\npack_group\nPack a group of sequences into bins using First-Fit Decreasing algorithm.\n\n\npack_parallel\nPack sequences into bins using parallel processing.\n\n\n\n\n\nutils.samplers.multipack.allocate_sequentially(\n sequence_lengths,\n rank,\n bin_capacity,\n num_ranks,\n)\nSequential allocator that preserves example order.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nThe lengths of all examples.\nrequired\n\n\nrank\nint\nThe current rank (for distributed training).\nrequired\n\n\nbin_capacity\nint\nThe capacity of each bin (maximum sequence length).\nrequired\n\n\nnum_ranks\nint\nNumber of ranks (processes / GPUs).\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nrank_batches\nlist[list[int]]\nList of batches for the current rank.\n\n\ntotal_tokens_used\nint\nNumber of actual example tokens.\n\n\ntotal_token_slots\nint\nMaximum theoretical number of example tokens (number of bins * bin capacity).\n\n\n\n\n\n\n\nutils.samplers.multipack.ffd_check(sequence_lengths, bin_capacity, num_bins)\nFirst-fit-decreasing bin packing algorithm check.\nChecks if sequences with the given lengths could fit in the specified number of\nbins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin.\nrequired\n\n\nnum_bins\nint\nNumber of bins available.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nTrue if all sequences can be packed, False otherwise.\n\n\n\n\n\n\n\nutils.samplers.multipack.pack_group(\n sequence_lengths,\n group_offset,\n bin_capacity,\n max_bins,\n bin_size,\n safe_mode=True,\n)\nPack a group of sequences into bins using First-Fit Decreasing algorithm.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\ngroup_offset\nint\nOffset to apply to indices when returning results.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin.\nrequired\n\n\nmax_bins\nint\nMaximum number of bins to use.\nrequired\n\n\nbin_size\nint\nMaximum number of sequences per bin.\nrequired\n\n\nsafe_mode\nbool\nIf True, use a more conservative packing approach.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[int]]\nList of bins, where each bin contains indices of sequences assigned to it.\n\n\n\n\n\n\n\nutils.samplers.multipack.pack_parallel(\n sequence_lengths,\n bin_capacity,\n group_size,\n bin_size,\n num_processes=None,\n safe_mode=True,\n mp_start_method='spawn',\n)\nPack sequences into bins using parallel 
processing.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin as total number of tokens.\nrequired\n\n\ngroup_size\nint\nNumber of sequences to process in each group.\nrequired\n\n\nbin_size\nint\nMaximum number of bins to use.\nrequired\n\n\nnum_processes\nint | None\nNumber of parallel processes to use.\nNone\n\n\nsafe_mode\nbool\nIf True, use a more conservative packing approach.\nTrue\n\n\nmp_start_method\nstr | None\nMultiprocessing start method (‘fork’, ‘spawn’, ‘forkserver’). ‘spawn’ is often safer with Numba/PyTorch. Set to None to use system default.\n'spawn'\n\n\n\nReturns:\nList of bins, where each bin contains indices of sequences assigned to it." + "text": "utils.samplers.multipack\nMultipack Batch Sampler - An efficient batch sampler for packing variable-length sequences\ninto fixed-capacity batches to optimize memory usage and training throughput.\n\n\n\n\n\nName\nDescription\n\n\n\n\nMultipackBatchSampler\nBatch sampler class for efficient packing of variable-length sequences\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler(\n sampler,\n batch_size,\n batch_max_len,\n lengths,\n packing_efficiency_estimate=1.0,\n drop_last=False,\n num_count_samples=8,\n sequential=False,\n group_size=100000,\n bin_size=200,\n num_processes=None,\n safe_mode=True,\n **kwargs,\n)\nBatch sampler class for efficient packing of variable-length sequences\nThis sampler packs sequences into fixed-capacity bins (batches) to maximize\nGPU memory utilization and training throughput by reducing padding.\nIt supports both parallel packing (using FFD algorithm) and\nsequential packing (preserving original sequence order).\n\n\n\n\n\nName\nDescription\n\n\n\n\nefficiency\nCalculate the packing efficiency (ratio of tokens used to total token slots).\n\n\ngather_efficiency\nGather and synchronize packing efficiency estimates across all distributed\n\n\ngather_len_batches\nGather and synchronize batch counts across all distributed ranks. Returns\n\n\ngenerate_batches\nGenerate packed batches for training.\n\n\nset_epoch\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.efficiency()\nCalculate the packing efficiency (ratio of tokens used to total token slots).\nHigher is better - 1.0 would mean perfect packing with no wasted space.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_efficiency()\nGather and synchronize packing efficiency estimates across all distributed\nranks.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nfloat\nA conservative efficiency estimate based on the measurements.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_len_batches(num)\nGather and synchronize batch counts across all distributed ranks. 
Returns\nthe minimum number of batches available on any rank.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.generate_batches(set_stats=False)\nGenerate packed batches for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nset_stats\nbool\nWhether to update efficiency statistics.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[list[int]]]\nList of batches, where each batch contains multiple bins, and each bin contains multiple sequence indices.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.set_epoch(epoch)\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nallocate_sequentially\nSequential allocator that preserves example order.\n\n\nffd_check\nFirst-fit-decreasing bin packing algorithm check.\n\n\npack_group\nPack a group of sequences into bins using First-Fit Decreasing algorithm.\n\n\npack_parallel\nPack sequences into bins using parallel processing.\n\n\n\n\n\nutils.samplers.multipack.allocate_sequentially(\n sequence_lengths,\n rank,\n bin_capacity,\n num_ranks,\n)\nSequential allocator that preserves example order.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nThe lengths of all examples.\nrequired\n\n\nrank\nint\nThe current rank (for distributed training).\nrequired\n\n\nbin_capacity\nint\nThe capacity of each bin (maximum sequence length).\nrequired\n\n\nnum_ranks\nint\nNumber of ranks (processes / GPUs).\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nrank_batches\nlist[list[int]]\nList of batches for the current rank.\n\n\ntotal_tokens_used\nint\nNumber of actual example tokens.\n\n\ntotal_token_slots\nint\nMaximum theoretical number of example tokens (number of bins * bin capacity).\n\n\n\n\n\n\n\nutils.samplers.multipack.ffd_check(sequence_lengths, bin_capacity, num_bins)\nFirst-fit-decreasing bin packing algorithm check.\nChecks if sequences with the given lengths could fit in the specified number of\nbins.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin.\nrequired\n\n\nnum_bins\nint\nNumber of bins available.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nTrue if all sequences can be packed, False otherwise.\n\n\n\n\n\n\n\nutils.samplers.multipack.pack_group(\n sequence_lengths,\n group_offset,\n bin_capacity,\n max_bins,\n bin_size,\n safe_mode=True,\n)\nPack a group of sequences into bins using First-Fit Decreasing algorithm.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\ngroup_offset\nint\nOffset to apply to indices when returning results.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin.\nrequired\n\n\nmax_bins\nint\nMaximum number of bins to use.\nrequired\n\n\nbin_size\nint\nMaximum number of sequences per bin.\nrequired\n\n\nsafe_mode\nbool\nIf True, use a more conservative packing approach.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[int]]\nList of bins, where each bin contains indices of sequences assigned to it.\n\n\n\n\n\n\n\nutils.samplers.multipack.pack_parallel(\n sequence_lengths,\n bin_capacity,\n group_size,\n bin_size,\n num_processes=None,\n safe_mode=True,\n mp_start_method='spawn',\n)\nPack sequences into bins using parallel 
processing.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsequence_lengths\nnp.ndarray\nArray of sequence lengths.\nrequired\n\n\nbin_capacity\nint\nMaximum capacity of each bin as total number of tokens.\nrequired\n\n\ngroup_size\nint\nNumber of sequences to process in each group.\nrequired\n\n\nbin_size\nint\nMaximum number of sequences per bin.\nrequired\n\n\nnum_processes\nint | None\nNumber of parallel processes to use.\nNone\n\n\nsafe_mode\nbool\nIf True, use a more conservative packing approach.\nTrue\n\n\nmp_start_method\nstr | None\nMultiprocessing start method (‘fork’, ‘spawn’, ‘forkserver’). ‘spawn’ is often safer with Numba/PyTorch. Set to None to use system default.\n'spawn'\n\n\n\nReturns:\nList of bins, where each bin contains indices of sequences assigned to it." }, { "objectID": "docs/api/utils.samplers.multipack.html#classes", "href": "docs/api/utils.samplers.multipack.html#classes", "title": "utils.samplers.multipack", "section": "", - "text": "Name\nDescription\n\n\n\n\nMultipackBatchSampler\nBatch sampler class for efficient packing of variable-length sequences\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler(\n sampler,\n batch_size,\n batch_max_len,\n lengths,\n packing_efficiency_estimate=1.0,\n drop_last=False,\n num_count_samples=16,\n sequential=False,\n group_size=100000,\n bin_size=200,\n num_processes=None,\n safe_mode=True,\n **kwargs,\n)\nBatch sampler class for efficient packing of variable-length sequences\nThis sampler packs sequences into fixed-capacity bins (batches) to maximize\nGPU memory utilization and training throughput by reducing padding.\nIt supports both parallel packing (using FFD algorithm) and\nsequential packing (preserving original sequence order).\n\n\n\n\n\nName\nDescription\n\n\n\n\nefficiency\nCalculate the packing efficiency (ratio of tokens used to total token slots).\n\n\ngather_efficiency\nGather and synchronize packing efficiency estimates across all distributed\n\n\ngather_len_batches\nGather and synchronize batch counts across all distributed ranks. Returns\n\n\ngenerate_batches\nGenerate packed batches for training.\n\n\nset_epoch\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.efficiency()\nCalculate the packing efficiency (ratio of tokens used to total token slots).\nHigher is better - 1.0 would mean perfect packing with no wasted space.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_efficiency()\nGather and synchronize packing efficiency estimates across all distributed\nranks.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nfloat\nA conservative efficiency estimate based on the measurements.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_len_batches(num)\nGather and synchronize batch counts across all distributed ranks. 
Returns\nthe minimum number of batches available on any rank.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.generate_batches(set_stats=False)\nGenerate packed batches for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nset_stats\nbool\nWhether to update efficiency statistics.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[list[int]]]\nList of batches, where each batch contains multiple bins, and each bin contains multiple sequence indices.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.set_epoch(epoch)\nSet the epoch number, used for reproducible shuffling across epochs" + "text": "Name\nDescription\n\n\n\n\nMultipackBatchSampler\nBatch sampler class for efficient packing of variable-length sequences\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler(\n sampler,\n batch_size,\n batch_max_len,\n lengths,\n packing_efficiency_estimate=1.0,\n drop_last=False,\n num_count_samples=8,\n sequential=False,\n group_size=100000,\n bin_size=200,\n num_processes=None,\n safe_mode=True,\n **kwargs,\n)\nBatch sampler class for efficient packing of variable-length sequences\nThis sampler packs sequences into fixed-capacity bins (batches) to maximize\nGPU memory utilization and training throughput by reducing padding.\nIt supports both parallel packing (using FFD algorithm) and\nsequential packing (preserving original sequence order).\n\n\n\n\n\nName\nDescription\n\n\n\n\nefficiency\nCalculate the packing efficiency (ratio of tokens used to total token slots).\n\n\ngather_efficiency\nGather and synchronize packing efficiency estimates across all distributed\n\n\ngather_len_batches\nGather and synchronize batch counts across all distributed ranks. Returns\n\n\ngenerate_batches\nGenerate packed batches for training.\n\n\nset_epoch\nSet the epoch number, used for reproducible shuffling across epochs\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.efficiency()\nCalculate the packing efficiency (ratio of tokens used to total token slots).\nHigher is better - 1.0 would mean perfect packing with no wasted space.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_efficiency()\nGather and synchronize packing efficiency estimates across all distributed\nranks.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nfloat\nA conservative efficiency estimate based on the measurements.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.gather_len_batches(num)\nGather and synchronize batch counts across all distributed ranks. 
Returns\nthe minimum number of batches available on any rank.\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.generate_batches(set_stats=False)\nGenerate packed batches for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nset_stats\nbool\nWhether to update efficiency statistics.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[list[list[int]]]\nList of batches, where each batch contains multiple bins, and each bin contains multiple sequence indices.\n\n\n\n\n\n\n\nutils.samplers.multipack.MultipackBatchSampler.set_epoch(epoch)\nSet the epoch number, used for reproducible shuffling across epochs" }, { "objectID": "docs/api/utils.samplers.multipack.html#functions", @@ -1833,11 +1833,21 @@ ] }, { - "objectID": "index.html", - "href": "index.html", + "objectID": "index.html#latest-updates", + "href": "index.html#latest-updates", "title": "Axolotl", - "section": "", - "text": "Axolotl is a tool designed to streamline post-training for various AI models.\nPost-training refers to any modifications or additional training performed on\npre-trained models - including full model fine-tuning, parameter-efficient tuning (like\nLoRA and QLoRA), supervised fine-tuning (SFT), instruction tuning, and alignment\ntechniques. With support for multiple model architectures and training configurations,\nAxolotl makes it easy to get started with these techniques.\nAxolotl is designed to work with YAML config files that contain everything you need to\npreprocess a dataset, train or fine-tune a model, run model inference or evaluation,\nand much more.\nFeatures:", + "section": "🎉 Latest Updates", + "text": "🎉 Latest Updates\n\n2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the docs to learn more!\n2025/04: Llama 4 support has been added in Axolotl. See examples to start training your own Llama 4 models with Axolotl’s linearized version!\n2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the blog and docs to learn how to scale your context length when fine-tuning.\n2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the docs to fine-tune your own!\n2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the docs to give it a try.\n2025/02: Axolotl has added GRPO support. Dive into our blog and GRPO example and have some fun!\n2025/01: Axolotl has added Reward Modelling / Process Reward Modelling fine-tuning support. See docs.", + "crumbs": [ + "Home" + ] + }, + { + "objectID": "index.html#overview", + "href": "index.html#overview", + "title": "Axolotl", + "section": "✨ Overview", + "text": "✨ Overview\nAxolotl is a tool designed to streamline post-training for various AI models.\nFeatures:\n\nMultiple Model Support: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more. 
We are compatible with HuggingFace transformers causal language models.\nTraining Methods: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), Multimodal, and Reward Modelling (RM) / Process Reward Modelling (PRM).\nEasy Configuration: Re-use a single YAML file between dataset preprocess, training, evaluation, quantization, and inference.\nPerformance Optimizations: Multipacking, Flash Attention, Xformers, Flex Attention, Liger Kernel, Cut Cross Entropy, Sequence Parallelism (SP), LoRA optimizations, Multi-GPU training (FSDP1, FSDP2, DeepSpeed), Multi-node training (Torchrun, Ray), and many more!\nFlexible Dataset Handling: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets.\nCloud Ready: We ship Docker images and also PyPI packages for use on cloud platforms and local hardware.", "crumbs": [ "Home" ] @@ -1852,22 +1862,12 @@ "Home" ] }, - { - "objectID": "index.html#key-features", - "href": "index.html#key-features", - "title": "Axolotl", - "section": "✨ Key Features", - "text": "✨ Key Features\n\nMultiple Model Support: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more\nTraining Methods: Full fine-tuning, LoRA, QLoRA, and more\nEasy Configuration: Simple YAML files to control your training setup\nPerformance Optimizations: Flash Attention, xformers, multi-GPU training\nFlexible Dataset Handling: Use various formats and custom datasets\nCloud Ready: Run on cloud platforms or local hardware", - "crumbs": [ - "Home" - ] - }, { "objectID": "index.html#documentation", "href": "index.html#documentation", "title": "Axolotl", "section": "📚 Documentation", - "text": "📚 Documentation\n\nInstallation Options - Detailed setup instructions for different environments\nConfiguration Guide - Full configuration options and examples\nDataset Guide - Supported formats and how to use them\nMulti-GPU Training\nMulti-Node Training\nMultipacking\nAPI Reference - Auto-generated code documentation\nFAQ - Frequently asked questions", + "text": "📚 Documentation\n\nInstallation Options - Detailed setup instructions for different environments\nConfiguration Guide - Full configuration options and examples\nDataset Loading - Loading datasets from various sources\nDataset Guide - Supported formats and how to use them\nMulti-GPU Training\nMulti-Node Training\nMultipacking\nAPI Reference - Auto-generated code documentation\nFAQ - Frequently asked questions", "crumbs": [ "Home" ] @@ -1892,16 +1892,6 @@ "Home" ] }, - { - "objectID": "index.html#supported-models", - "href": "index.html#supported-models", - "title": "Axolotl", - "section": "Supported Models", - "text": "Supported Models\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nfp16/fp32\nlora\nqlora\ngptq\ngptq w/flash attn\nflash attn\nxformers attn\n\n\n\n\nllama\n✅\n✅\n✅\n✅\n✅\n✅\n✅\n\n\nMistral\n✅\n✅\n✅\n✅\n✅\n✅\n✅\n\n\nMixtral-MoE\n✅\n✅\n✅\n❓\n❓\n❓\n❓\n\n\nMixtral8X22\n✅\n✅\n✅\n❓\n❓\n❓\n❓\n\n\nPythia\n✅\n✅\n✅\n❌\n❌\n❌\n❓\n\n\ncerebras\n✅\n✅\n✅\n❌\n❌\n❌\n❓\n\n\nbtlm\n✅\n✅\n✅\n❌\n❌\n❌\n❓\n\n\nmpt\n✅\n❌\n❓\n❌\n❌\n❌\n❓\n\n\nfalcon\n✅\n✅\n✅\n❌\n❌\n❌\n❓\n\n\ngpt-j\n✅\n✅\n✅\n❌\n❌\n❓\n❓\n\n\nXGen\n✅\n❓\n✅\n❓\n❓\n❓\n✅\n\n\nphi\n✅\n✅\n✅\n❓\n❓\n❓\n❓\n\n\nRWKV\n✅\n❓\n❓\n❓\n❓\n❓\n❓\n\n\nQwen\n✅\n✅\n✅\n❓\n❓\n❓\n❓\n\n\nGemma\n✅\n✅\n✅\n❓\n❓\n✅\n❓\n\n\nJamba\n✅\n✅\n✅\n❓\n❓\n✅\n❓\n\n\n\n✅: supported\n❌: not supported\n❓: untested", - "crumbs": [ - "Home" - ] - }, { "objectID": "index.html#sponsors", "href": "index.html#sponsors", diff --git a/sitemap.xml b/sitemap.xml index 62452cce5..83835e8ff 100644 --- a/sitemap.xml 
+++ b/sitemap.xml @@ -2,758 +2,758 @@
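
The `MultipackBatchSampler` entries documented in the diff above are easier to follow with a concrete example. The sketch below assumes only the constructor signature and the `generate_batches` / `efficiency` behavior shown in those entries; the length distribution, parameter values, and the `RandomSampler` wiring are illustrative stand-ins, not a reference for how Axolotl's trainer constructs the sampler.

```python
import numpy as np
from torch.utils.data import RandomSampler

from axolotl.utils.samplers.multipack import MultipackBatchSampler

# Toy length distribution standing in for a tokenized dataset (hypothetical).
lengths = np.random.default_rng(0).integers(64, 2048, size=10_000)

batch_sampler = MultipackBatchSampler(
    sampler=RandomSampler(range(len(lengths))),  # yields shuffled example indices
    batch_size=4,          # packed bins grouped into one batch (per the docs above)
    batch_max_len=4096,    # bin capacity in tokens
    lengths=lengths,
    drop_last=True,
)

# Per the documented API, generate_batches(set_stats=True) packs indices into
# bins and updates usage statistics, after which efficiency() reports the
# ratio of tokens used to total token slots.
batches = batch_sampler.generate_batches(set_stats=True)
print(f"{len(batches)} batches, efficiency ~ {batch_sampler.efficiency():.1%}")
```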
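
`ffd_check` is documented above only by its description. As an illustration of the first-fit-decreasing strategy it names, here is a plain NumPy re-implementation written from that description; Axolotl's actual (likely Numba-compiled) version may differ in details, so treat this as a sketch of the algorithm rather than the library's code.

```python
import numpy as np

def ffd_check(sequence_lengths: np.ndarray, bin_capacity: int, num_bins: int) -> bool:
    """First-fit-decreasing feasibility check: sort sequences longest-first,
    place each into the lowest-index bin with room, and fail if any sequence
    would need a bin beyond num_bins."""
    remaining = np.full(num_bins, bin_capacity, dtype=np.int64)
    for length in np.sort(sequence_lengths)[::-1]:   # decreasing order
        if length > bin_capacity:
            return False                             # can never fit anywhere
        fits = np.nonzero(remaining >= length)[0]
        if fits.size == 0:
            return False                             # no bin has room left
        remaining[fits[0]] -= length                 # first fit: lowest index
    return True

# Three sequences of 1500/1300/700 tokens fit in two 2048-token bins...
assert ffd_check(np.array([1500, 1300, 700]), bin_capacity=2048, num_bins=2)
# ...but not in one.
assert not ffd_check(np.array([1500, 1300, 700]), bin_capacity=2048, num_bins=1)
```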
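
The `efficiency` figure and the `total_tokens_used` / `total_token_slots` values returned by `allocate_sequentially` are related by the simple arithmetic documented above (token slots = number of bins times bin capacity). A worked example, with arbitrary numbers:

```python
def packing_efficiency(total_tokens_used: int, num_bins: int, bin_capacity: int) -> float:
    """Ratio of actual example tokens to available token slots;
    1.0 would mean perfect packing with no wasted space."""
    return total_tokens_used / (num_bins * bin_capacity)

# 3500 example tokens packed into two 2048-token bins (4096 slots):
print(f"{packing_efficiency(3500, num_bins=2, bin_capacity=2048):.1%}")  # 85.4%
```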
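
`pack_parallel`'s `mp_start_method` note ('spawn' is often safer with Numba/PyTorch) matters because 'fork' can carry JIT or CUDA state into worker processes. A usage sketch under the signature documented above, with arbitrary sizes; the `__main__` guard is required with 'spawn', which re-imports the main module in each worker:

```python
import numpy as np

from axolotl.utils.samplers.multipack import pack_parallel

if __name__ == "__main__":
    lengths = np.random.default_rng(0).integers(64, 2048, size=200_000)
    bins = pack_parallel(
        lengths,
        bin_capacity=4096,        # max tokens per bin
        group_size=100_000,       # sequences handled per packing group
        bin_size=200,             # max sequences per bin
        num_processes=None,       # let the implementation pick a worker count
        safe_mode=True,           # more conservative packing, per the docs
        mp_start_method="spawn",  # avoids fork-related issues with Numba/PyTorch
    )
    print(f"{len(bins)} bins; bin 0 holds sequence indices {bins[0][:5]}")
```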