From 505e4cba9f3b2fe1529c56afb23bd97d0b013c92 Mon Sep 17 00:00:00 2001 From: Quarto GHA Workflow Runner Date: Wed, 28 May 2025 08:53:35 +0000 Subject: [PATCH] Built site for gh-pages --- .nojekyll | 2 +- FAQS.html | 9 + TODO.html | 9 + docs/amd_hpc.html | 9 + docs/api/cli.args.html | 9 + docs/api/cli.checks.html | 9 + docs/api/cli.cloud.base.html | 9 + docs/api/cli.cloud.modal_.html | 9 + docs/api/cli.config.html | 9 + docs/api/cli.evaluate.html | 9 + docs/api/cli.inference.html | 9 + docs/api/cli.main.html | 9 + docs/api/cli.merge_lora.html | 9 + docs/api/cli.merge_sharded_fsdp_weights.html | 9 + docs/api/cli.preprocess.html | 9 + docs/api/cli.sweeps.html | 9 + docs/api/cli.train.html | 9 + docs/api/cli.utils.html | 9 + docs/api/cli.vllm_serve.html | 9 + docs/api/common.architectures.html | 9 + docs/api/common.const.html | 9 + docs/api/common.datasets.html | 9 + docs/api/convert.html | 9 + docs/api/core.chat.format.chatml.html | 9 + docs/api/core.chat.format.llama3x.html | 9 + docs/api/core.chat.format.shared.html | 9 + docs/api/core.chat.messages.html | 9 + docs/api/core.datasets.chat.html | 9 + ...core.datasets.transforms.chat_builder.html | 9 + docs/api/core.trainer_builder.html | 9 + docs/api/core.trainers.base.html | 9 + docs/api/core.trainers.dpo.trainer.html | 9 + docs/api/core.trainers.grpo.sampler.html | 9 + docs/api/core.trainers.grpo.trainer.html | 9 + docs/api/core.trainers.mamba.html | 9 + docs/api/core.trainers.mixins.optimizer.html | 9 + ...core.trainers.mixins.rng_state_loader.html | 9 + docs/api/core.trainers.mixins.scheduler.html | 9 + docs/api/core.trainers.relora.html | 9 + docs/api/core.trainers.trl.html | 9 + docs/api/core.trainers.utils.html | 9 + docs/api/core.training_args.html | 9 + docs/api/datasets.html | 9 + docs/api/evaluate.html | 9 + docs/api/index.html | 9 + docs/api/integrations.base.html | 9 + .../integrations.cut_cross_entropy.args.html | 9 + docs/api/integrations.grokfast.optimizer.html | 9 + 
docs/api/integrations.kd.trainer.html | 9 + docs/api/integrations.liger.args.html | 9 + docs/api/integrations.lm_eval.args.html | 9 + docs/api/integrations.spectrum.args.html | 9 + docs/api/kernels.geglu.html | 9 + docs/api/kernels.lora.html | 9 + docs/api/kernels.quantize.html | 9 + docs/api/kernels.swiglu.html | 9 + docs/api/kernels.utils.html | 9 + docs/api/loaders.adapter.html | 9 + docs/api/loaders.constants.html | 9 + docs/api/loaders.model.html | 9 + docs/api/loaders.patch_manager.html | 9 + docs/api/loaders.processor.html | 9 + docs/api/loaders.tokenizer.html | 9 + docs/api/logging_config.html | 9 + docs/api/models.mamba.modeling_mamba.html | 9 + docs/api/monkeypatch.attention.mllama.html | 9 + .../monkeypatch.btlm_attn_hijack_flash.html | 9 + ...onkeypatch.data.batch_dataset_fetcher.html | 9 + ...ch.gradient_checkpointing.offload_cpu.html | 9 + ...h.gradient_checkpointing.offload_disk.html | 9 + .../monkeypatch.llama_attn_hijack_flash.html | 9 + ...onkeypatch.llama_attn_hijack_xformers.html | 9 + docs/api/monkeypatch.llama_expand_mask.html | 9 + .../monkeypatch.llama_patch_multipack.html | 9 + docs/api/monkeypatch.lora_kernels.html | 9 + ...monkeypatch.mistral_attn_hijack_flash.html | 9 + docs/api/monkeypatch.mixtral.html | 9 + docs/api/monkeypatch.multipack.html | 9 + docs/api/monkeypatch.relora.html | 9 + ...onkeypatch.stablelm_attn_hijack_flash.html | 9 + docs/api/monkeypatch.trainer_fsdp_optim.html | 9 + .../monkeypatch.transformers_fa_utils.html | 9 + docs/api/monkeypatch.unsloth_.html | 9 + docs/api/monkeypatch.utils.html | 9 + docs/api/prompt_strategies.alpaca_chat.html | 9 + .../prompt_strategies.alpaca_instruct.html | 9 + .../prompt_strategies.alpaca_w_system.html | 9 + docs/api/prompt_strategies.base.html | 9 + ...rompt_strategies.bradley_terry.llama3.html | 9 + docs/api/prompt_strategies.chat_template.html | 9 + docs/api/prompt_strategies.completion.html | 9 + .../prompt_strategies.dpo.chat_template.html | 9 + 
docs/api/prompt_strategies.dpo.chatml.html | 9 + docs/api/prompt_strategies.dpo.llama3.html | 9 + .../prompt_strategies.dpo.passthrough.html | 9 + .../prompt_strategies.dpo.user_defined.html | 9 + docs/api/prompt_strategies.dpo.zephyr.html | 9 + docs/api/prompt_strategies.input_output.html | 9 + docs/api/prompt_strategies.kto.chatml.html | 9 + docs/api/prompt_strategies.kto.llama3.html | 9 + .../prompt_strategies.kto.user_defined.html | 9 + docs/api/prompt_strategies.llama2_chat.html | 9 + docs/api/prompt_strategies.messages.chat.html | 9 + docs/api/prompt_strategies.metharme.html | 9 + docs/api/prompt_strategies.orcamini.html | 9 + .../prompt_strategies.orpo.chat_template.html | 9 + docs/api/prompt_strategies.pygmalion.html | 9 + ...prompt_strategies.stepwise_supervised.html | 9 + docs/api/prompt_strategies.user_defined.html | 9 + docs/api/prompt_tokenizers.html | 9 + docs/api/train.html | 9 + docs/api/utils.bench.html | 9 + docs/api/utils.callbacks.comet_.html | 9 + docs/api/utils.callbacks.lisa.html | 9 + docs/api/utils.callbacks.mlflow_.html | 9 + docs/api/utils.callbacks.perplexity.html | 9 + docs/api/utils.callbacks.profiler.html | 9 + docs/api/utils.chat_templates.html | 9 + docs/api/utils.collators.batching.html | 9 + docs/api/utils.collators.core.html | 9 + docs/api/utils.collators.mamba.html | 9 + docs/api/utils.collators.mm_chat.html | 9 + .../utils.ctx_managers.sequence_parallel.html | 9 + docs/api/utils.data.pretraining.html | 9 + docs/api/utils.data.sft.html | 9 + docs/api/utils.dict.html | 9 + docs/api/utils.distributed.html | 9 + docs/api/utils.freeze.html | 9 + docs/api/utils.lora.html | 9 + docs/api/utils.model_shard_quant.html | 9 + docs/api/utils.optimizers.adopt.html | 9 + docs/api/utils.samplers.multipack.html | 9 + docs/api/utils.schedulers.html | 9 + docs/api/utils.schemas.config.html | 9 + docs/api/utils.schemas.datasets.html | 9 + docs/api/utils.schemas.enums.html | 9 + docs/api/utils.schemas.integrations.html | 9 + 
docs/api/utils.schemas.model.html | 9 + docs/api/utils.schemas.multimodal.html | 9 + docs/api/utils.schemas.peft.html | 9 + docs/api/utils.schemas.training.html | 9 + docs/api/utils.schemas.trl.html | 9 + docs/api/utils.schemas.utils.html | 9 + docs/api/utils.tokenization.html | 9 + docs/api/utils.trainer.html | 9 + docs/batch_vs_grad.html | 9 + docs/cli.html | 9 + docs/config.html | 1315 +++++++++-------- docs/custom_integrations.html | 9 + docs/dataset-formats/conversation.html | 9 + docs/dataset-formats/index.html | 42 +- docs/dataset-formats/inst_tune.html | 9 + docs/dataset-formats/pretraining.html | 9 + docs/dataset-formats/stepwise_supervised.html | 9 + docs/dataset-formats/template_free.html | 9 + docs/dataset-formats/tokenized.html | 9 + docs/dataset_loading.html | 93 +- docs/dataset_preprocessing.html | 9 + docs/debugging.html | 9 + docs/docker.html | 9 + docs/faq.html | 9 + docs/fsdp_qlora.html | 9 + docs/getting-started.html | 9 + docs/inference.html | 9 + docs/input_output.html | 9 + docs/installation.html | 9 + docs/lora_optims.html | 22 + docs/lr_groups.html | 9 + docs/mac.html | 9 + docs/multi-gpu.html | 9 + docs/multi-node.html | 9 + docs/multimodal.html | 9 + docs/multipack.html | 9 + docs/nccl.html | 9 + docs/ray-integration.html | 9 + docs/reward_modelling.html | 9 + docs/rlhf.html | 47 +- docs/sequence_parallelism.html | 9 + docs/torchao.html | 9 + docs/unsloth.html | 9 + .../colab-axolotl-example.html | 9 + index.html | 9 + search.json | 14 +- sitemap.xml | 366 ++--- src/axolotl/integrations/LICENSE.html | 9 + .../cut_cross_entropy/ACKNOWLEDGEMENTS.html | 9 + 186 files changed, 2601 insertions(+), 902 deletions(-) diff --git a/.nojekyll b/.nojekyll index ad2c8e29a..086bdb8d2 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -925c4a19 \ No newline at end of file +de37548f \ No newline at end of file diff --git a/FAQS.html b/FAQS.html index 897108846..42906f7c8 100644 --- a/FAQS.html +++ b/FAQS.html @@ -67,6 +67,15 @@ ul.task-list li 
input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/TODO.html b/TODO.html index 2f07c7d39..a88a3ef41 100644 --- a/TODO.html +++ b/TODO.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/amd_hpc.html b/docs/amd_hpc.html index 597aa97e2..9e373d17a 100644 --- a/docs/amd_hpc.html +++ b/docs/amd_hpc.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/cli.args.html b/docs/api/cli.args.html index 2ee791277..838a215cf 100644 --- a/docs/api/cli.args.html +++ b/docs/api/cli.args.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/cli.checks.html b/docs/api/cli.checks.html index 2a08f9364..22a412e9f 100644 --- a/docs/api/cli.checks.html +++ b/docs/api/cli.checks.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/cli.cloud.base.html b/docs/api/cli.cloud.base.html index d04b69fcd..fe0f1bb16 100644 --- a/docs/api/cli.cloud.base.html +++ b/docs/api/cli.cloud.base.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/cli.cloud.modal_.html b/docs/api/cli.cloud.modal_.html index 2e9303462..64bad8695 100644 --- a/docs/api/cli.cloud.modal_.html +++ b/docs/api/cli.cloud.modal_.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/cli.config.html b/docs/api/cli.config.html index 2f6891bb1..1b26fa60c 100644 --- a/docs/api/cli.config.html +++ b/docs/api/cli.config.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { 
text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/cli.evaluate.html b/docs/api/cli.evaluate.html index a69ad710e..d081945db 100644 --- a/docs/api/cli.evaluate.html +++ b/docs/api/cli.evaluate.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/cli.inference.html b/docs/api/cli.inference.html index 0d933efd1..24ee0e988 100644 --- a/docs/api/cli.inference.html +++ b/docs/api/cli.inference.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/cli.main.html b/docs/api/cli.main.html index 23af0874e..02440fd1e 100644 --- a/docs/api/cli.main.html +++ b/docs/api/cli.main.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/cli.merge_lora.html b/docs/api/cli.merge_lora.html index 7f193f4ac..422b2050a 100644 --- a/docs/api/cli.merge_lora.html +++ b/docs/api/cli.merge_lora.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/cli.merge_sharded_fsdp_weights.html b/docs/api/cli.merge_sharded_fsdp_weights.html index 22fca8bf4..927b7c08e 100644 --- a/docs/api/cli.merge_sharded_fsdp_weights.html +++ b/docs/api/cli.merge_sharded_fsdp_weights.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/cli.preprocess.html b/docs/api/cli.preprocess.html index f2fa1b798..dd9a79f63 100644 --- a/docs/api/cli.preprocess.html +++ b/docs/api/cli.preprocess.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git 
a/docs/api/cli.sweeps.html b/docs/api/cli.sweeps.html index e5727ce78..387d44f9d 100644 --- a/docs/api/cli.sweeps.html +++ b/docs/api/cli.sweeps.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/cli.train.html b/docs/api/cli.train.html index f5cfb3674..b71d6437d 100644 --- a/docs/api/cli.train.html +++ b/docs/api/cli.train.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/cli.utils.html b/docs/api/cli.utils.html index 85990de4e..a1e4743cc 100644 --- a/docs/api/cli.utils.html +++ b/docs/api/cli.utils.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/cli.vllm_serve.html b/docs/api/cli.vllm_serve.html index 17f3cee66..8ed174f84 100644 --- a/docs/api/cli.vllm_serve.html +++ b/docs/api/cli.vllm_serve.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/common.architectures.html b/docs/api/common.architectures.html index dd3a5bf09..2ec474593 100644 --- a/docs/api/common.architectures.html +++ b/docs/api/common.architectures.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/common.const.html b/docs/api/common.const.html index 1b91b9a1c..392add601 100644 --- a/docs/api/common.const.html +++ b/docs/api/common.const.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/common.datasets.html b/docs/api/common.datasets.html index 605b71905..5f4426be3 100644 --- a/docs/api/common.datasets.html +++ b/docs/api/common.datasets.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > 
a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/convert.html b/docs/api/convert.html index 30f6fa3fa..6d1ec4a92 100644 --- a/docs/api/convert.html +++ b/docs/api/convert.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/core.chat.format.chatml.html b/docs/api/core.chat.format.chatml.html index 25570c324..e0ecfcf34 100644 --- a/docs/api/core.chat.format.chatml.html +++ b/docs/api/core.chat.format.chatml.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/core.chat.format.llama3x.html b/docs/api/core.chat.format.llama3x.html index 66971b609..1320fb05d 100644 --- a/docs/api/core.chat.format.llama3x.html +++ b/docs/api/core.chat.format.llama3x.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/core.chat.format.shared.html b/docs/api/core.chat.format.shared.html index 95a94f8ef..7329914ab 100644 --- a/docs/api/core.chat.format.shared.html +++ b/docs/api/core.chat.format.shared.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/core.chat.messages.html b/docs/api/core.chat.messages.html index 9226a162b..9d009aa7d 100644 --- a/docs/api/core.chat.messages.html +++ b/docs/api/core.chat.messages.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/core.datasets.chat.html b/docs/api/core.datasets.chat.html index ace370750..133a7962e 100644 --- a/docs/api/core.datasets.chat.html +++ b/docs/api/core.datasets.chat.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git 
a/docs/api/core.datasets.transforms.chat_builder.html b/docs/api/core.datasets.transforms.chat_builder.html index bd89a7641..b0f37ae12 100644 --- a/docs/api/core.datasets.transforms.chat_builder.html +++ b/docs/api/core.datasets.transforms.chat_builder.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/core.trainer_builder.html b/docs/api/core.trainer_builder.html index 68a707f4a..9ced597d7 100644 --- a/docs/api/core.trainer_builder.html +++ b/docs/api/core.trainer_builder.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/core.trainers.base.html b/docs/api/core.trainers.base.html index 9ee346ea7..7848c85b4 100644 --- a/docs/api/core.trainers.base.html +++ b/docs/api/core.trainers.base.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/core.trainers.dpo.trainer.html b/docs/api/core.trainers.dpo.trainer.html index fddb23c31..30b58c254 100644 --- a/docs/api/core.trainers.dpo.trainer.html +++ b/docs/api/core.trainers.dpo.trainer.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/core.trainers.grpo.sampler.html b/docs/api/core.trainers.grpo.sampler.html index 3dfba36be..ca55cc873 100644 --- a/docs/api/core.trainers.grpo.sampler.html +++ b/docs/api/core.trainers.grpo.sampler.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/core.trainers.grpo.trainer.html b/docs/api/core.trainers.grpo.trainer.html index a61d9ea0b..ad083dcd5 100644 --- a/docs/api/core.trainers.grpo.trainer.html +++ 
b/docs/api/core.trainers.grpo.trainer.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/core.trainers.mamba.html b/docs/api/core.trainers.mamba.html index 5f8943e04..2a694eb47 100644 --- a/docs/api/core.trainers.mamba.html +++ b/docs/api/core.trainers.mamba.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/core.trainers.mixins.optimizer.html b/docs/api/core.trainers.mixins.optimizer.html index 03c3f5f36..7894c6388 100644 --- a/docs/api/core.trainers.mixins.optimizer.html +++ b/docs/api/core.trainers.mixins.optimizer.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/core.trainers.mixins.rng_state_loader.html b/docs/api/core.trainers.mixins.rng_state_loader.html index 03d099777..4f2b83abf 100644 --- a/docs/api/core.trainers.mixins.rng_state_loader.html +++ b/docs/api/core.trainers.mixins.rng_state_loader.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/core.trainers.mixins.scheduler.html b/docs/api/core.trainers.mixins.scheduler.html index d44647ea3..1986eb997 100644 --- a/docs/api/core.trainers.mixins.scheduler.html +++ b/docs/api/core.trainers.mixins.scheduler.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/core.trainers.relora.html b/docs/api/core.trainers.relora.html index 08f4716f2..f18ef276b 100644 --- a/docs/api/core.trainers.relora.html +++ b/docs/api/core.trainers.relora.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": 
"Search" } } + + + diff --git a/docs/api/core.trainers.trl.html b/docs/api/core.trainers.trl.html index 9040d90bc..78ec00788 100644 --- a/docs/api/core.trainers.trl.html +++ b/docs/api/core.trainers.trl.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/core.trainers.utils.html b/docs/api/core.trainers.utils.html index d46c37517..a047a8560 100644 --- a/docs/api/core.trainers.utils.html +++ b/docs/api/core.trainers.utils.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/core.training_args.html b/docs/api/core.training_args.html index ffb25b909..6203a55fd 100644 --- a/docs/api/core.training_args.html +++ b/docs/api/core.training_args.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/datasets.html b/docs/api/datasets.html index 97cfcc817..0860082ae 100644 --- a/docs/api/datasets.html +++ b/docs/api/datasets.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/evaluate.html b/docs/api/evaluate.html index e25fd85b5..0c32dbe75 100644 --- a/docs/api/evaluate.html +++ b/docs/api/evaluate.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/index.html b/docs/api/index.html index 3f1f280c0..bd27dc278 100644 --- a/docs/api/index.html +++ b/docs/api/index.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/integrations.base.html b/docs/api/integrations.base.html index 7024243d2..a49b2fa54 100644 --- a/docs/api/integrations.base.html +++ b/docs/api/integrations.base.html @@ -102,6 +102,15 @@ pre > 
code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/integrations.cut_cross_entropy.args.html b/docs/api/integrations.cut_cross_entropy.args.html index d56425caf..09d942575 100644 --- a/docs/api/integrations.cut_cross_entropy.args.html +++ b/docs/api/integrations.cut_cross_entropy.args.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/integrations.grokfast.optimizer.html b/docs/api/integrations.grokfast.optimizer.html index 73cd04f6b..870a67f87 100644 --- a/docs/api/integrations.grokfast.optimizer.html +++ b/docs/api/integrations.grokfast.optimizer.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/integrations.kd.trainer.html b/docs/api/integrations.kd.trainer.html index 7f6b9a1ae..da64fe96f 100644 --- a/docs/api/integrations.kd.trainer.html +++ b/docs/api/integrations.kd.trainer.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/integrations.liger.args.html b/docs/api/integrations.liger.args.html index 7241e3637..ca3f381b1 100644 --- a/docs/api/integrations.liger.args.html +++ b/docs/api/integrations.liger.args.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/integrations.lm_eval.args.html b/docs/api/integrations.lm_eval.args.html index f5b2bc0e6..ab8fb5b57 100644 --- a/docs/api/integrations.lm_eval.args.html +++ b/docs/api/integrations.lm_eval.args.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/integrations.spectrum.args.html 
b/docs/api/integrations.spectrum.args.html index 41c5c6d83..84e65fc63 100644 --- a/docs/api/integrations.spectrum.args.html +++ b/docs/api/integrations.spectrum.args.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/kernels.geglu.html b/docs/api/kernels.geglu.html index 7b4b32b66..86d89cd89 100644 --- a/docs/api/kernels.geglu.html +++ b/docs/api/kernels.geglu.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/kernels.lora.html b/docs/api/kernels.lora.html index 17c6f540a..19f6cd11a 100644 --- a/docs/api/kernels.lora.html +++ b/docs/api/kernels.lora.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/kernels.quantize.html b/docs/api/kernels.quantize.html index 2fdb66b46..34b2b831b 100644 --- a/docs/api/kernels.quantize.html +++ b/docs/api/kernels.quantize.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/kernels.swiglu.html b/docs/api/kernels.swiglu.html index 629ceb633..550456daf 100644 --- a/docs/api/kernels.swiglu.html +++ b/docs/api/kernels.swiglu.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/kernels.utils.html b/docs/api/kernels.utils.html index 467b5d545..524ad9899 100644 --- a/docs/api/kernels.utils.html +++ b/docs/api/kernels.utils.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/loaders.adapter.html b/docs/api/loaders.adapter.html index 8d26b4cff..39930df98 100644 --- a/docs/api/loaders.adapter.html +++ 
b/docs/api/loaders.adapter.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/loaders.constants.html b/docs/api/loaders.constants.html index 41a6838bf..303dac8ad 100644 --- a/docs/api/loaders.constants.html +++ b/docs/api/loaders.constants.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/loaders.model.html b/docs/api/loaders.model.html index 32bbadc89..9e54402b9 100644 --- a/docs/api/loaders.model.html +++ b/docs/api/loaders.model.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/loaders.patch_manager.html b/docs/api/loaders.patch_manager.html index 37d00f0db..1131b77ad 100644 --- a/docs/api/loaders.patch_manager.html +++ b/docs/api/loaders.patch_manager.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/loaders.processor.html b/docs/api/loaders.processor.html index dc8a0bf67..56226f6af 100644 --- a/docs/api/loaders.processor.html +++ b/docs/api/loaders.processor.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/loaders.tokenizer.html b/docs/api/loaders.tokenizer.html index 6e70ab7f3..c6f3f07f9 100644 --- a/docs/api/loaders.tokenizer.html +++ b/docs/api/loaders.tokenizer.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/logging_config.html b/docs/api/logging_config.html index 773c1c26f..cdf730f50 100644 --- a/docs/api/logging_config.html +++ b/docs/api/logging_config.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin 
"search-label": "Search" } } + + + diff --git a/docs/api/models.mamba.modeling_mamba.html b/docs/api/models.mamba.modeling_mamba.html index 62b1c07e3..b7f7a8f63 100644 --- a/docs/api/models.mamba.modeling_mamba.html +++ b/docs/api/models.mamba.modeling_mamba.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.attention.mllama.html b/docs/api/monkeypatch.attention.mllama.html index 95086804a..92e2871eb 100644 --- a/docs/api/monkeypatch.attention.mllama.html +++ b/docs/api/monkeypatch.attention.mllama.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.btlm_attn_hijack_flash.html b/docs/api/monkeypatch.btlm_attn_hijack_flash.html index 072611c66..f81e1f765 100644 --- a/docs/api/monkeypatch.btlm_attn_hijack_flash.html +++ b/docs/api/monkeypatch.btlm_attn_hijack_flash.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.data.batch_dataset_fetcher.html b/docs/api/monkeypatch.data.batch_dataset_fetcher.html index 96906fda7..98e96be49 100644 --- a/docs/api/monkeypatch.data.batch_dataset_fetcher.html +++ b/docs/api/monkeypatch.data.batch_dataset_fetcher.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html b/docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html index 734e1ba7b..9757abcc6 100644 --- a/docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html +++ b/docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.gradient_checkpointing.offload_disk.html 
b/docs/api/monkeypatch.gradient_checkpointing.offload_disk.html index 04b94725c..85531d286 100644 --- a/docs/api/monkeypatch.gradient_checkpointing.offload_disk.html +++ b/docs/api/monkeypatch.gradient_checkpointing.offload_disk.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.llama_attn_hijack_flash.html b/docs/api/monkeypatch.llama_attn_hijack_flash.html index 4488fb361..ecd41351e 100644 --- a/docs/api/monkeypatch.llama_attn_hijack_flash.html +++ b/docs/api/monkeypatch.llama_attn_hijack_flash.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.llama_attn_hijack_xformers.html b/docs/api/monkeypatch.llama_attn_hijack_xformers.html index 16d851304..c6c551659 100644 --- a/docs/api/monkeypatch.llama_attn_hijack_xformers.html +++ b/docs/api/monkeypatch.llama_attn_hijack_xformers.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.llama_expand_mask.html b/docs/api/monkeypatch.llama_expand_mask.html index e2c45dbc9..c3d50ce3f 100644 --- a/docs/api/monkeypatch.llama_expand_mask.html +++ b/docs/api/monkeypatch.llama_expand_mask.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.llama_patch_multipack.html b/docs/api/monkeypatch.llama_patch_multipack.html index 4fbd27b09..e020544f4 100644 --- a/docs/api/monkeypatch.llama_patch_multipack.html +++ b/docs/api/monkeypatch.llama_patch_multipack.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.lora_kernels.html b/docs/api/monkeypatch.lora_kernels.html index 3933bb6a6..726975d9a 100644 --- a/docs/api/monkeypatch.lora_kernels.html +++ 
b/docs/api/monkeypatch.lora_kernels.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.mistral_attn_hijack_flash.html b/docs/api/monkeypatch.mistral_attn_hijack_flash.html index 8dfd1e1ac..79b70bef7 100644 --- a/docs/api/monkeypatch.mistral_attn_hijack_flash.html +++ b/docs/api/monkeypatch.mistral_attn_hijack_flash.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.mixtral.html b/docs/api/monkeypatch.mixtral.html index 8ceda9492..181b4e95f 100644 --- a/docs/api/monkeypatch.mixtral.html +++ b/docs/api/monkeypatch.mixtral.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.multipack.html b/docs/api/monkeypatch.multipack.html index 0dcc83711..e1f27acd9 100644 --- a/docs/api/monkeypatch.multipack.html +++ b/docs/api/monkeypatch.multipack.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.relora.html b/docs/api/monkeypatch.relora.html index d7e9192d4..c7b6ab932 100644 --- a/docs/api/monkeypatch.relora.html +++ b/docs/api/monkeypatch.relora.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.stablelm_attn_hijack_flash.html b/docs/api/monkeypatch.stablelm_attn_hijack_flash.html index cea06a524..a4691d031 100644 --- a/docs/api/monkeypatch.stablelm_attn_hijack_flash.html +++ b/docs/api/monkeypatch.stablelm_attn_hijack_flash.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.trainer_fsdp_optim.html 
b/docs/api/monkeypatch.trainer_fsdp_optim.html index 87dbe0fc4..7261189b3 100644 --- a/docs/api/monkeypatch.trainer_fsdp_optim.html +++ b/docs/api/monkeypatch.trainer_fsdp_optim.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.transformers_fa_utils.html b/docs/api/monkeypatch.transformers_fa_utils.html index 5256993da..7dc0a36bc 100644 --- a/docs/api/monkeypatch.transformers_fa_utils.html +++ b/docs/api/monkeypatch.transformers_fa_utils.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.unsloth_.html b/docs/api/monkeypatch.unsloth_.html index 8e977b026..275653a2b 100644 --- a/docs/api/monkeypatch.unsloth_.html +++ b/docs/api/monkeypatch.unsloth_.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/monkeypatch.utils.html b/docs/api/monkeypatch.utils.html index 2f3cac299..6e1e02c7e 100644 --- a/docs/api/monkeypatch.utils.html +++ b/docs/api/monkeypatch.utils.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.alpaca_chat.html b/docs/api/prompt_strategies.alpaca_chat.html index 35234f2bf..7d8b6f585 100644 --- a/docs/api/prompt_strategies.alpaca_chat.html +++ b/docs/api/prompt_strategies.alpaca_chat.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.alpaca_instruct.html b/docs/api/prompt_strategies.alpaca_instruct.html index 79a52e06b..b4fc0eb55 100644 --- a/docs/api/prompt_strategies.alpaca_instruct.html +++ b/docs/api/prompt_strategies.alpaca_instruct.html @@ -67,6 +67,15 @@ ul.task-list li 
input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.alpaca_w_system.html b/docs/api/prompt_strategies.alpaca_w_system.html index 3c70ca2a6..f11c6610b 100644 --- a/docs/api/prompt_strategies.alpaca_w_system.html +++ b/docs/api/prompt_strategies.alpaca_w_system.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.base.html b/docs/api/prompt_strategies.base.html index 7e5e14766..58a8a9ae1 100644 --- a/docs/api/prompt_strategies.base.html +++ b/docs/api/prompt_strategies.base.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.bradley_terry.llama3.html b/docs/api/prompt_strategies.bradley_terry.llama3.html index 42d2e0798..6148d8b5e 100644 --- a/docs/api/prompt_strategies.bradley_terry.llama3.html +++ b/docs/api/prompt_strategies.bradley_terry.llama3.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.chat_template.html b/docs/api/prompt_strategies.chat_template.html index bc2ead48d..101971935 100644 --- a/docs/api/prompt_strategies.chat_template.html +++ b/docs/api/prompt_strategies.chat_template.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.completion.html b/docs/api/prompt_strategies.completion.html index 89cbd8a33..10b2053bf 100644 --- a/docs/api/prompt_strategies.completion.html +++ b/docs/api/prompt_strategies.completion.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.dpo.chat_template.html 
b/docs/api/prompt_strategies.dpo.chat_template.html index 9ce5ad025..ad6e41e3a 100644 --- a/docs/api/prompt_strategies.dpo.chat_template.html +++ b/docs/api/prompt_strategies.dpo.chat_template.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.dpo.chatml.html b/docs/api/prompt_strategies.dpo.chatml.html index 39e5ad006..3e3d452b2 100644 --- a/docs/api/prompt_strategies.dpo.chatml.html +++ b/docs/api/prompt_strategies.dpo.chatml.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.dpo.llama3.html b/docs/api/prompt_strategies.dpo.llama3.html index 047e40751..5cbf8c909 100644 --- a/docs/api/prompt_strategies.dpo.llama3.html +++ b/docs/api/prompt_strategies.dpo.llama3.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.dpo.passthrough.html b/docs/api/prompt_strategies.dpo.passthrough.html index 3b1264d30..a2b59ea38 100644 --- a/docs/api/prompt_strategies.dpo.passthrough.html +++ b/docs/api/prompt_strategies.dpo.passthrough.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.dpo.user_defined.html b/docs/api/prompt_strategies.dpo.user_defined.html index 92f691d7e..6db1629d0 100644 --- a/docs/api/prompt_strategies.dpo.user_defined.html +++ b/docs/api/prompt_strategies.dpo.user_defined.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.dpo.zephyr.html b/docs/api/prompt_strategies.dpo.zephyr.html index f8b304d12..5970aae3a 100644 --- a/docs/api/prompt_strategies.dpo.zephyr.html +++ b/docs/api/prompt_strategies.dpo.zephyr.html @@ -67,6 +67,15 @@ ul.task-list 
li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.input_output.html b/docs/api/prompt_strategies.input_output.html index c1f3fd08c..9aa30c080 100644 --- a/docs/api/prompt_strategies.input_output.html +++ b/docs/api/prompt_strategies.input_output.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.kto.chatml.html b/docs/api/prompt_strategies.kto.chatml.html index d7fe3b385..79d22d8ae 100644 --- a/docs/api/prompt_strategies.kto.chatml.html +++ b/docs/api/prompt_strategies.kto.chatml.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.kto.llama3.html b/docs/api/prompt_strategies.kto.llama3.html index 25478700f..08d592ec8 100644 --- a/docs/api/prompt_strategies.kto.llama3.html +++ b/docs/api/prompt_strategies.kto.llama3.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.kto.user_defined.html b/docs/api/prompt_strategies.kto.user_defined.html index e68727f5e..f00134085 100644 --- a/docs/api/prompt_strategies.kto.user_defined.html +++ b/docs/api/prompt_strategies.kto.user_defined.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.llama2_chat.html b/docs/api/prompt_strategies.llama2_chat.html index f015f61df..fceeaebb0 100644 --- a/docs/api/prompt_strategies.llama2_chat.html +++ b/docs/api/prompt_strategies.llama2_chat.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.messages.chat.html 
b/docs/api/prompt_strategies.messages.chat.html index ac5906975..b6c2900ac 100644 --- a/docs/api/prompt_strategies.messages.chat.html +++ b/docs/api/prompt_strategies.messages.chat.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.metharme.html b/docs/api/prompt_strategies.metharme.html index 5ef56c7f6..26b7e52c9 100644 --- a/docs/api/prompt_strategies.metharme.html +++ b/docs/api/prompt_strategies.metharme.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.orcamini.html b/docs/api/prompt_strategies.orcamini.html index 6fd961ac6..060059688 100644 --- a/docs/api/prompt_strategies.orcamini.html +++ b/docs/api/prompt_strategies.orcamini.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.orpo.chat_template.html b/docs/api/prompt_strategies.orpo.chat_template.html index 54b302f29..da197fa89 100644 --- a/docs/api/prompt_strategies.orpo.chat_template.html +++ b/docs/api/prompt_strategies.orpo.chat_template.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.pygmalion.html b/docs/api/prompt_strategies.pygmalion.html index cff046e57..de5eaf61c 100644 --- a/docs/api/prompt_strategies.pygmalion.html +++ b/docs/api/prompt_strategies.pygmalion.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.stepwise_supervised.html b/docs/api/prompt_strategies.stepwise_supervised.html index 83db292e0..a5e2b6c8d 100644 --- 
a/docs/api/prompt_strategies.stepwise_supervised.html +++ b/docs/api/prompt_strategies.stepwise_supervised.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_strategies.user_defined.html b/docs/api/prompt_strategies.user_defined.html index 7bdcb5ec1..ad1474676 100644 --- a/docs/api/prompt_strategies.user_defined.html +++ b/docs/api/prompt_strategies.user_defined.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/prompt_tokenizers.html b/docs/api/prompt_tokenizers.html index b59385a81..00571c647 100644 --- a/docs/api/prompt_tokenizers.html +++ b/docs/api/prompt_tokenizers.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/train.html b/docs/api/train.html index b30a903bb..bfaf9e7b7 100644 --- a/docs/api/train.html +++ b/docs/api/train.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.bench.html b/docs/api/utils.bench.html index 6166453b7..8a128fe88 100644 --- a/docs/api/utils.bench.html +++ b/docs/api/utils.bench.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.callbacks.comet_.html b/docs/api/utils.callbacks.comet_.html index 130de2279..95eb7cca0 100644 --- a/docs/api/utils.callbacks.comet_.html +++ b/docs/api/utils.callbacks.comet_.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.callbacks.lisa.html b/docs/api/utils.callbacks.lisa.html index c16f16824..188e59d14 100644 
--- a/docs/api/utils.callbacks.lisa.html +++ b/docs/api/utils.callbacks.lisa.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/utils.callbacks.mlflow_.html b/docs/api/utils.callbacks.mlflow_.html index 787c910c5..de03c82c1 100644 --- a/docs/api/utils.callbacks.mlflow_.html +++ b/docs/api/utils.callbacks.mlflow_.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.callbacks.perplexity.html b/docs/api/utils.callbacks.perplexity.html index 2da6caf61..cbe217324 100644 --- a/docs/api/utils.callbacks.perplexity.html +++ b/docs/api/utils.callbacks.perplexity.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.callbacks.profiler.html b/docs/api/utils.callbacks.profiler.html index 12fe467a0..5899a27af 100644 --- a/docs/api/utils.callbacks.profiler.html +++ b/docs/api/utils.callbacks.profiler.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.chat_templates.html b/docs/api/utils.chat_templates.html index 92ac1bde8..f5522857c 100644 --- a/docs/api/utils.chat_templates.html +++ b/docs/api/utils.chat_templates.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.collators.batching.html b/docs/api/utils.collators.batching.html index eff8da679..48ae88c13 100644 --- a/docs/api/utils.collators.batching.html +++ b/docs/api/utils.collators.batching.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.collators.core.html 
b/docs/api/utils.collators.core.html index 37c0c7fd9..17f2e65ee 100644 --- a/docs/api/utils.collators.core.html +++ b/docs/api/utils.collators.core.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/utils.collators.mamba.html b/docs/api/utils.collators.mamba.html index bb624203b..002f1f6d1 100644 --- a/docs/api/utils.collators.mamba.html +++ b/docs/api/utils.collators.mamba.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.collators.mm_chat.html b/docs/api/utils.collators.mm_chat.html index ff0e41a3b..6b02e5891 100644 --- a/docs/api/utils.collators.mm_chat.html +++ b/docs/api/utils.collators.mm_chat.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.ctx_managers.sequence_parallel.html b/docs/api/utils.ctx_managers.sequence_parallel.html index 882fe9955..85f7f5f72 100644 --- a/docs/api/utils.ctx_managers.sequence_parallel.html +++ b/docs/api/utils.ctx_managers.sequence_parallel.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.data.pretraining.html b/docs/api/utils.data.pretraining.html index 456a3bea7..a87f64d70 100644 --- a/docs/api/utils.data.pretraining.html +++ b/docs/api/utils.data.pretraining.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/utils.data.sft.html b/docs/api/utils.data.sft.html index aa14ce563..97d626913 100644 --- a/docs/api/utils.data.sft.html +++ b/docs/api/utils.data.sft.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/api/utils.dict.html b/docs/api/utils.dict.html index 
1a29a4ed4..cd5fde067 100644 --- a/docs/api/utils.dict.html +++ b/docs/api/utils.dict.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.distributed.html b/docs/api/utils.distributed.html index a9a58b4a2..4d3783610 100644 --- a/docs/api/utils.distributed.html +++ b/docs/api/utils.distributed.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.freeze.html b/docs/api/utils.freeze.html index 7c38d9a7e..977c93930 100644 --- a/docs/api/utils.freeze.html +++ b/docs/api/utils.freeze.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.lora.html b/docs/api/utils.lora.html index 846e8e235..95360edb3 100644 --- a/docs/api/utils.lora.html +++ b/docs/api/utils.lora.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.model_shard_quant.html b/docs/api/utils.model_shard_quant.html index 7061cf3ce..7e35f9e53 100644 --- a/docs/api/utils.model_shard_quant.html +++ b/docs/api/utils.model_shard_quant.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.optimizers.adopt.html b/docs/api/utils.optimizers.adopt.html index 015d5262d..56d078798 100644 --- a/docs/api/utils.optimizers.adopt.html +++ b/docs/api/utils.optimizers.adopt.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.samplers.multipack.html b/docs/api/utils.samplers.multipack.html index a5ffd5839..81fe30840 100644 --- 
a/docs/api/utils.samplers.multipack.html +++ b/docs/api/utils.samplers.multipack.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.schedulers.html b/docs/api/utils.schedulers.html index 596102a7a..7b3f49f50 100644 --- a/docs/api/utils.schedulers.html +++ b/docs/api/utils.schedulers.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.schemas.config.html b/docs/api/utils.schemas.config.html index 1553b60ff..565789e22 100644 --- a/docs/api/utils.schemas.config.html +++ b/docs/api/utils.schemas.config.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.schemas.datasets.html b/docs/api/utils.schemas.datasets.html index 2925f1560..90e470980 100644 --- a/docs/api/utils.schemas.datasets.html +++ b/docs/api/utils.schemas.datasets.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.schemas.enums.html b/docs/api/utils.schemas.enums.html index 2cc686741..28a58463e 100644 --- a/docs/api/utils.schemas.enums.html +++ b/docs/api/utils.schemas.enums.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.schemas.integrations.html b/docs/api/utils.schemas.integrations.html index a3e5d128e..d3718a799 100644 --- a/docs/api/utils.schemas.integrations.html +++ b/docs/api/utils.schemas.integrations.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.schemas.model.html 
b/docs/api/utils.schemas.model.html index e69675b33..033c329ab 100644 --- a/docs/api/utils.schemas.model.html +++ b/docs/api/utils.schemas.model.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.schemas.multimodal.html b/docs/api/utils.schemas.multimodal.html index 0eb179e08..e94311d14 100644 --- a/docs/api/utils.schemas.multimodal.html +++ b/docs/api/utils.schemas.multimodal.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.schemas.peft.html b/docs/api/utils.schemas.peft.html index 9c8ba8da1..53aa95121 100644 --- a/docs/api/utils.schemas.peft.html +++ b/docs/api/utils.schemas.peft.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.schemas.training.html b/docs/api/utils.schemas.training.html index a7420a39d..e3fd0b5b2 100644 --- a/docs/api/utils.schemas.training.html +++ b/docs/api/utils.schemas.training.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.schemas.trl.html b/docs/api/utils.schemas.trl.html index 7ae16e6c9..d81298a50 100644 --- a/docs/api/utils.schemas.trl.html +++ b/docs/api/utils.schemas.trl.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.schemas.utils.html b/docs/api/utils.schemas.utils.html index 97559bf9f..a514d9dd3 100644 --- a/docs/api/utils.schemas.utils.html +++ b/docs/api/utils.schemas.utils.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git 
a/docs/api/utils.tokenization.html b/docs/api/utils.tokenization.html index 60146f8ce..2032a5320 100644 --- a/docs/api/utils.tokenization.html +++ b/docs/api/utils.tokenization.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/api/utils.trainer.html b/docs/api/utils.trainer.html index dbb4d626d..9bf19d12a 100644 --- a/docs/api/utils.trainer.html +++ b/docs/api/utils.trainer.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/batch_vs_grad.html b/docs/batch_vs_grad.html index 864e16f74..ed5e8dffd 100644 --- a/docs/batch_vs_grad.html +++ b/docs/batch_vs_grad.html @@ -68,6 +68,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/cli.html b/docs/cli.html index 6f0bc3385..f0666c9ef 100644 --- a/docs/cli.html +++ b/docs/cli.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/config.html b/docs/config.html index 4459e81c3..51c39aa0a 100644 --- a/docs/config.html +++ b/docs/config.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + @@ -566,651 +575,685 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin # A list of one or more datasets to finetune the model with -datasets: - # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files - - path: vicgalle/alpaca-gpt4 - # The type of prompt to use for training. 
[alpaca, gpteacher, oasst, reflection] - type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn> - ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file - data_files: # Optional[str] path to source data files - - shards: # Optional[int] split dataset into N pieces (use with shards_idx) - shards_idx: # Optional[int] = 0 the index of sharded dataset to use - - preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (exclusive with `shards`) +# See https://docs.axolotl.ai/docs/dataset_loading.html for guide on loading datasets +# See https://docs.axolotl.ai/docs/dataset-formats/ for guide on dataset formats +datasets: + # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory + - path: vicgalle/alpaca-gpt4 + # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection] + type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn> + ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file + data_files: # Optional[str] path to source data files + + shards: # Optional[int] split dataset into N pieces (use with shards_idx) + shards_idx: # Optional[int] = 0 the index of sharded dataset to use - name: # Optional[str] name of dataset configuration to load - split: train # Optional[str] name of dataset split to load from - revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets. - trust_remote_code: # Optional[bool] Trust remote code for untrusted source - - # Custom user instruction prompt - - path: repo - type: - # The below are defaults. only set what's needed if you use a different column name. 
- system_prompt: "" - system_format: "{system}" - field_system: system - field_instruction: instruction - field_input: input - field_output: output - - # Customizable to be single line or multi-line - # Use {instruction}/{input} as key to be replaced - # 'format' can include {input} - format: |- - User: {instruction} {input} - Assistant: - # 'no_input_format' cannot include {input} - no_input_format: "{instruction} " - - # For `completion` datsets only, uses the provided field instead of `text` column - field: - - # Using chat template - - path: ... - # Set type to `chat_template` to use this strategy - type: chat_template - # Specify the name of the chat template to use - # The name of the chat template to use for training, following values are supported: - # - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default. - # - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py - # - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. tokenizer_default_fallback_chatml. - # - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field. - chat_template: tokenizer_default - - # Custom jinja chat template. Used only if `chat_template: jinja` or empty. - chat_template_jinja: - - # Key containing the messages (default: "messages") - field_messages: messages - - # Key containing the system message (default: "system") - # If the system message is not present in the dataset sample, it will be loaded from the field_system property. - field_system: system - - # Mapping of properties from the input dataset to the chat template. 
- # (default: message_property_mappings={'role':'role', 'content':'content'}) - # If a property exists in the template but not in this mapping, the system will attempt - # to load it directly from the message using the property name as the key. - # Example: In the mapping below, 'from' is loaded from input dataset and used as 'role', - # while 'value' is loaded and used as 'content' in the chat template. - message_property_mappings: - role: from - content: value - # ... - - # Optional[Dict[str, List]]. Roles mapping in the messages. - # The format is {target_role: [source_roles]}. All source roles will be mapped to the target role. - # The default is: - roles: - user: ["human", "user"] - assistant: ["gpt", "assistant"] - system: ["system"] - tool: ["tool"] - - # Optional[bool]. Whether to drop the system turn from the dataset. Only works with chat_template. - # This does not drop the default system message from chat_template if it exists. If you wish to, - # we recommend using a custom jinja template with the default system message removed or - # adding a system turn with empty content. - drop_system_message: - - # Optional[bool]. (for Qwen3 template only) Whether to split the assistant content based on a reasoning trace inside delimited tags - # See example at `docs/dataset-formats/conversation.qmd` - split_thinking: - - # IMPORTANT: The following fields determine which parts of the conversation to train on. - # Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train - # See examples at `docs/dataset-formats/conversation.qmd` - # Note: If the below 5 fields are empty, defaults to training only on the last message. - - # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss. - roles_to_train: ["assistant"] # default - # Optional[str]. Which EOS tokens to train on in the conversation. 
Possible values are: - # - all: train on all EOS tokens - # - turn (default): train on the EOS token at the end of each trainable turn - # - last: train on the last EOS token in the conversation - # TIP: Please make sure that your `tokenizer.eos_token` is same as EOS/EOT token in template. Otherwise, set `eos_token` under `special_tokens`. - train_on_eos: turn - # Optional[str]. Which EOT (End-of-Turn) tokens to train on in the conversation. Possible values are: - # - all: train on all EOT tokens - # - turn: train on the EOT token at the end of each trainable turn - # - last: train on the last EOT token in the conversation - # If not specified, defaults to the value of train_on_eos for backward compatibility. - train_on_eot: - # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`. - message_field_training: training - # The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn. - # The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train). - message_field_training_detail: train_detail - - -# If false, the datasets will not be shuffled and will keep their original order in `datasets`. -# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true. -shuffle_merged_datasets: true - -Deduplicates datasets and test_datasets with identical entries. -dataset_exact_deduplication: true - -# A list of one or more datasets to eval the model with. -# You can use either test_datasets, or val_set_size, but not both. -test_datasets: - - path: /workspace/data/eval.jsonl - ds_type: json - # You need to specify a split. For "json" datasets the default split is called "train". 
- split: train - type: completion - data_files: - - /workspace/data/eval.jsonl - -# use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo' -rl: -rl_beta: # Optional[float]. The beta parameter for the RL training. - -# dpo -dpo_use_weighting: # Optional[bool]. Whether to perform weighting. -rpo_alpha: # Optional[float]. Weighting of NLL term in loss from RPO paper. - -# orpo -orpo_alpha: 0.1 # Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to `beta` in `ORPOConfig` due to trl mapping. - -# kto -kto_desirable_weight: # Optional[float]. Factor for desirable loss term in KTO loss. -kto_undesirable_weight: # Optional[float]. Factor for undesirable loss term in KTO loss. - -# simpo -cpo_alpha: 1.0 # Weight of the BC regularizer -simpo_gamma: 0.5 # Target reward margin for the SimPO loss - -# grpo -trl: - use_vllm: # Optional[bool]. Whether to use VLLM for RL training. - vllm_server_host: # Optional[str]. Host of the vLLM server to connect to. - vllm_server_port: # Optional[int]. Port of the vLLM server to connect to. - vllm_server_timeout: # Optional[int]. Total timeout (in seconds) to wait for the vLLM server to respond. - vllm_guided_decoding_regex: # Optional[str]. Regex for vLLM guided decoding. - - beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`. Use - max_completion_length: # Optional[int]. Maximum length of the completion for RL training. - - reward_funcs: # Optional[list[str]]. List of reward functions to load. Paths must be importable from current dir. - reward_weights: # Optional[list[float]]. List of reward weights for the reward functions. - - num_generations: # Optional[int]. Number of generations to sample. - log_completions: # Optional[bool]. Whether to log completions. - - sync_ref_model: # Optional[bool]. Whether to sync the reference model. - ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model. - ref_model_sync_steps: # Optional[int]. 
Sync steps for the reference model. - - -# reward modelling: `True` or `False` -reward_model: + preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (exclusive with `shards`) + + name: # Optional[str] name of dataset configuration to load + split: train # Optional[str] name of dataset split to load from + revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets. + trust_remote_code: # Optional[bool] Trust remote code for untrusted source + + # Custom user instruction prompt + - path: repo + type: + # The below are defaults. only set what's needed if you use a different column name. + system_prompt: "" + system_format: "{system}" + field_system: system + field_instruction: instruction + field_input: input + field_output: output + + # Customizable to be single line or multi-line + # Use {instruction}/{input} as key to be replaced + # 'format' can include {input} + format: |- + User: {instruction} {input} + Assistant: + # 'no_input_format' cannot include {input} + no_input_format: "{instruction} " + + # For `completion` datasets only, uses the provided field instead of `text` column + field: + + # Using chat template + - path: ... + # Set type to `chat_template` to use this strategy + type: chat_template + # Specify the name of the chat template to use + # The name of the chat template to use for training, following values are supported: + # - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default.
+ # - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py + # - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. tokenizer_default_fallback_chatml. + # - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field. + chat_template: tokenizer_default + + # Custom jinja chat template. Used only if `chat_template: jinja` or empty. + chat_template_jinja: + + # Key containing the messages (default: "messages") + field_messages: messages + + # Key containing the system message (default: "system") + # If the system message is not present in the dataset sample, it will be loaded from the field_system property. + field_system: system + + # Mapping of properties from the input dataset to the chat template. + # (default: message_property_mappings={'role':'role', 'content':'content'}) + # If a property exists in the template but not in this mapping, the system will attempt + # to load it directly from the message using the property name as the key. + # Example: In the mapping below, 'from' is loaded from input dataset and used as 'role', + # while 'value' is loaded and used as 'content' in the chat template. + message_property_mappings: + role: from + content: value + # ... + + # Optional[Dict[str, List]]. Roles mapping in the messages. + # The format is {target_role: [source_roles]}. All source roles will be mapped to the target role. + # The default is: + roles: + user: ["human", "user"] + assistant: ["gpt", "assistant"] + system: ["system"] + tool: ["tool"] + + # Optional[bool]. Whether to drop the system turn from the dataset. Only works with chat_template. + # This does not drop the default system message from chat_template if it exists. 
If you wish to, + # we recommend using a custom jinja template with the default system message removed or + # adding a system turn with empty content. + drop_system_message: + + # Optional[bool]. (for Qwen3 template only) Whether to split the assistant content based on a reasoning trace inside delimited tags + # See example at `docs/dataset-formats/conversation.qmd` + split_thinking: + + # IMPORTANT: The following fields determine which parts of the conversation to train on. + # Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train + # See examples at `docs/dataset-formats/conversation.qmd` + # Note: If the below 5 fields are empty, defaults to training only on the last message. + + # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss. + roles_to_train: ["assistant"] # default + # Optional[str]. Which EOS tokens to train on in the conversation. Possible values are: + # - all: train on all EOS tokens + # - turn (default): train on the EOS token at the end of each trainable turn + # - last: train on the last EOS token in the conversation + # TIP: Please make sure that your `tokenizer.eos_token` is same as EOS/EOT token in template. Otherwise, set `eos_token` under `special_tokens`. + train_on_eos: turn + # Optional[str]. Which EOT (End-of-Turn) tokens to train on in the conversation. Possible values are: + # - all: train on all EOT tokens + # - turn: train on the EOT token at the end of each trainable turn + # - last: train on the last EOT token in the conversation + # If not specified, defaults to the value of train_on_eos for backward compatibility. + train_on_eot: + # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`. + message_field_training: training + # The key in the message turn that contains the training details. 
Useful to selectively train on certain tokens in a turn. + # The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train). + message_field_training_detail: train_detail + + +# If false, the datasets will not be shuffled and will keep their original order in `datasets`. +# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true. +shuffle_merged_datasets: true + +# Deduplicates datasets and test_datasets with identical entries. +dataset_exact_deduplication: true + +# A list of one or more datasets to eval the model with. +# You can use either test_datasets, or val_set_size, but not both. +test_datasets: + - path: /workspace/data/eval.jsonl + ds_type: json + # You need to specify a split. For "json" datasets the default split is called "train". + split: train + type: completion + data_files: + - /workspace/data/eval.jsonl + +# use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo' +rl: +rl_beta: # Optional[float]. The beta parameter for the RL training. + +# dpo +dpo_use_weighting: # Optional[bool]. Whether to perform weighting. +rpo_alpha: # Optional[float]. Weighting of NLL term in loss from RPO paper. + +# orpo +orpo_alpha: 0.1 # Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to `beta` in `ORPOConfig` due to trl mapping. + +# kto +kto_desirable_weight: # Optional[float]. Factor for desirable loss term in KTO loss. +kto_undesirable_weight: # Optional[float]. Factor for undesirable loss term in KTO loss. + +# simpo +cpo_alpha: 1.0 # Weight of the BC regularizer +simpo_gamma: 0.5 # Target reward margin for the SimPO loss + +# grpo +trl: + use_vllm: # Optional[bool]. Whether to use VLLM for RL training. + vllm_server_host: # Optional[str]. Host of the vLLM server to connect to. + vllm_server_port: # Optional[int]. Port of the vLLM server to connect to. 
+ vllm_server_timeout: # Optional[int]. Total timeout (in seconds) to wait for the vLLM server to respond. + vllm_guided_decoding_regex: # Optional[str]. Regex for vLLM guided decoding. + + beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`. Use + max_completion_length: # Optional[int]. Maximum length of the completion for RL training. + + reward_funcs: # Optional[list[str]]. List of reward functions to load. Paths must be importable from current dir. + reward_weights: # Optional[list[float]]. List of reward weights for the reward functions. + + num_generations: # Optional[int]. Number of generations to sample. + log_completions: # Optional[bool]. Whether to log completions. + num_completions_to_print: # Optional[int]. Number of completions to print when log_completions is True. + + sync_ref_model: # Optional[bool]. Whether to sync the reference model. + ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model. + ref_model_sync_steps: # Optional[int]. Sync steps for the reference model. + scale_rewards: # Optional[bool]. Whether to scale rewards by their standard deviation. -# process reward modelling: `True` or `False` -process_reward_model: - -# The name of the chat template to use for training, following values are supported: -# - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value. -# - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py -# - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer. -# - jinja: Uses a custom jinja template for the chat template. 
The custom jinja template should be provided in the chat_template_jinja field. -# The selected chat template will be saved to the tokenizer_config.json for easier inferencing -# Note: It is recommended to set train_on_inputs to true when using a chat template that is different from the model's default chat template. -chat_template: tokenizer_default -# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null. -chat_template_jinja: null -# Optional[List[str]]. Custom EOT (End-of-Turn) tokens to mask/unmask during training. -# These tokens mark the boundaries between conversation turns. -# For example: ["/INST", "</s>", "[/SYSTEM_PROMPT]"] -# If not specified, defaults to just the model's eos_token. -# This is useful for templates that use multiple delimiter tokens. -eot_tokens: - # - "</s>" - # - "[/INST]" - # - "[/SYSTEM_PROMPT]" -# Changes the default system message -default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml. -# Axolotl attempts to save the dataset as an arrow after packing the data together so -# subsequent training attempts load faster, relative path -dataset_prepared_path: data/last_run_prepared -# Push prepared dataset to hub -push_dataset_to_hub: # Optional[str] repo_org/repo_name -# The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()` -# if not set. 
-dataset_processes: # defaults to os.cpu_count() if not set -# Keep dataset in memory while preprocessing -# Only needed if cached dataset is taking too much storage -dataset_keep_in_memory: -# push checkpoints to hub -hub_model_id: # private repo path to push finetuned model -# how to push checkpoints to hub -# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy -hub_strategy: -# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets -# Required to be true when used in combination with `push_dataset_to_hub` -hf_use_auth_token: # boolean -# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval. -val_set_size: 0.04 -# Num shards for whole dataset -dataset_shard_num: -# Index of shard to use for whole dataset -dataset_shard_idx: - -# The maximum length of an input to train with, this should typically be less than 2048 -# as most models have a token/context limit of 2048 -sequence_len: 2048 -# Pad inputs so each step uses constant sized buffers -# This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently -pad_to_sequence_len: -# Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true' -sample_packing: -# Set to 'false' if getting errors during eval with sample_packing on. -eval_sample_packing: -# You can set these packing optimizations AFTER starting a training at least once. -# The trainer will provide recommended values for these values. -sample_packing_eff_est: -total_num_tokens: -# Increasing the following values helps with packing, but usually only slightly (<%1.) -# The number of samples packed at a time. -sample_packing_group_size: 100000 -# The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples. 
-sample_packing_bin_size: 200 -sample_pack_sequentially: # Optional[bool]. Whether to pack samples sequentially. - -# whether to concatenate samples during pretraining -pretraining_sample_concatenation: - -curriculum_sampling: # Optional[bool]. Whether to use sequential sampling for curriculum learning - -# Use batch flattening for speedups when not using sample_packing -batch_flattening: - -# Passed through to transformers when loading the model when launched without accelerate -# Use `sequential` when training w/ model parallelism to limit memory -device_map: -# Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model. -max_memory: - -# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model -adapter: lora -# If you already have a lora model trained that you want to load, put that here. -# This means after training, if you want to test the model, you should set this to the value of `output_dir`. -# Note that if you merge an adapter to the base model, a new subdirectory `merged` will be created under the `output_dir`. -lora_model_dir: - -# LoRA hyperparameters -# For more details about the following options, see: -# https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2 -lora_r: 8 -lora_alpha: 16 -lora_dropout: 0.05 -lora_target_modules: - - q_proj - - v_proj -# - k_proj -# - o_proj -# - gate_proj -# - down_proj -# - up_proj -lora_target_linear: # If true, will target all linear modules - -# List[int] | int. # The layer indices to transform, otherwise, apply to all layers -# https://huggingface.co/docs/peft/v0.15.0/en/package_reference/lora#peft.LoraConfig.layers_to_transform -peft_layers_to_transform: - -# Optional[bool]. Whether to use DoRA. -# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#weight-decomposed-low-rank-adaptation-dora -peft_use_dora: - -# Optional[bool]. Whether to use RSLoRA. 
-# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#rank-stabilized-lora -peft_use_rslora: - -# Optional[list[tuple[int, int]]]. List of layer indices to replicate. -# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#memory-efficient-layer-replication-with-lora -peft_layer_replication: - -# bool | Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq"] -# How to initialize LoRA weights. Default to True which is MS original implementation. -# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#initialization -peft_init_lora_weights: + temperature: # Optional[float]. Sampling temperature for the GRPO policy. + top_p: # Optional[float]. Top-p sampling probability for the generation policy. + top_k: # Optional[int]. Top-k sampling for the generation policy. + min_p: # Optional[float]. Minimum probability for the generation policy. + repetition_penalty: # Optional[float]. Penalty for tokens that appear in prompt and generated text. + + num_iterations: # Optional[int]. Number of iterations per batch (μ) for GRPO. + epsilon: # Optional[float]. Epsilon value for clipping in the GRPO algorithm. + epsilon_high: # Optional[float]. Upper-bound epsilon value for clipping in the GRPO algorithm. + use_liger_loss: # Optional[bool]. Whether to use Liger loss for GRPO. + loss_type: # Optional[str]. Loss formulation to use. Supported values: grpo, bnpo, dr_grpo. + mask_truncated_completions: # Optional[bool]. Whether to exclude truncated completions from loss calculation. + + +# reward modelling: `True` or `False` +reward_model: + +# process reward modelling: `True` or `False` +process_reward_model: + +# The name of the chat template to use for training, following values are supported: +# - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value. 
+# - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py +# - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer. +# - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field. +# The selected chat template will be saved to the tokenizer_config.json for easier inferencing +# Note: It is recommended to set train_on_inputs to true when using a chat template that is different from the model's default chat template. +chat_template: tokenizer_default +# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null. +chat_template_jinja: null +# Optional[List[str]]. Custom EOT (End-of-Turn) tokens to mask/unmask during training. +# These tokens mark the boundaries between conversation turns. +# For example: ["/INST", "</s>", "[/SYSTEM_PROMPT]"] +# If not specified, defaults to just the model's eos_token. +# This is useful for templates that use multiple delimiter tokens. +eot_tokens: + # - "</s>" + # - "[/INST]" + # - "[/SYSTEM_PROMPT]" +# Changes the default system message +default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml. +# Axolotl attempts to save the dataset as an arrow after packing the data together so +# subsequent training attempts load faster, relative path +dataset_prepared_path: data/last_run_prepared +# Push prepared dataset to hub +push_dataset_to_hub: # Optional[str] repo_org/repo_name +# The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()` +# if not set. 
+dataset_processes: # defaults to os.cpu_count() if not set +# Keep dataset in memory while preprocessing +# Only needed if cached dataset is taking too much storage +dataset_keep_in_memory: +# push checkpoints to hub +hub_model_id: # private repo path to push finetuned model +# how to push checkpoints to hub +# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy +hub_strategy: +# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets +# Required to be true when used in combination with `push_dataset_to_hub` +hf_use_auth_token: # boolean +# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval. +val_set_size: 0.04 +# Num shards for whole dataset +dataset_shard_num: +# Index of shard to use for whole dataset +dataset_shard_idx: + +# The maximum length of an input to train with, this should typically be less than 2048 +# as most models have a token/context limit of 2048 +sequence_len: 2048 +# Pad inputs so each step uses constant sized buffers +# This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently +pad_to_sequence_len: +# Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true' +sample_packing: +# Set to 'false' if getting errors during eval with sample_packing on. +eval_sample_packing: +# You can set these packing optimizations AFTER starting a training at least once. +# The trainer will provide recommended values for these values. +sample_packing_eff_est: +total_num_tokens: +# Increasing the following values helps with packing, but usually only slightly (<1%). +# The number of samples packed at a time. +sample_packing_group_size: 100000 +# The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
+sample_packing_bin_size: 200 +sample_pack_sequentially: # Optional[bool]. Whether to pack samples sequentially. + +# whether to concatenate samples during pretraining +pretraining_sample_concatenation: + +curriculum_sampling: # Optional[bool]. Whether to use sequential sampling for curriculum learning + +# Use batch flattening for speedups when not using sample_packing +batch_flattening: + +# Passed through to transformers when loading the model when launched without accelerate +# Use `sequential` when training w/ model parallelism to limit memory +device_map: +# Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model. +max_memory: + +# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model +adapter: lora +# If you already have a lora model trained that you want to load, put that here. +# This means after training, if you want to test the model, you should set this to the value of `output_dir`. +# Note that if you merge an adapter to the base model, a new subdirectory `merged` will be created under the `output_dir`. +lora_model_dir: + +# LoRA hyperparameters +# For more details about the following options, see: +# https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2 +lora_r: 8 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: + - q_proj + - v_proj +# - k_proj +# - o_proj +# - gate_proj +# - down_proj +# - up_proj +lora_target_linear: # If true, will target all linear modules + +# List[int] | int. # The layer indices to transform, otherwise, apply to all layers +# https://huggingface.co/docs/peft/v0.15.0/en/package_reference/lora#peft.LoraConfig.layers_to_transform +peft_layers_to_transform: -# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens. -# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models. 
-# `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities. -# https://github.com/huggingface/peft/issues/334#issuecomment-1561727994 -lora_modules_to_save: -# - embed_tokens -# - lm_head +# Optional[bool]. Whether to use DoRA. +# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#weight-decomposed-low-rank-adaptation-dora +peft_use_dora: + +# Optional[bool]. Whether to use RSLoRA. +# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#rank-stabilized-lora +peft_use_rslora: -lora_fan_in_fan_out: false - -# Apply custom LoRA autograd functions and activation function Triton kernels for -# speed and memory savings -# See: https://docs.axolotl.ai/docs/lora_optims.html -lora_mlp_kernel: true -lora_qkv_kernel: true -lora_o_kernel: true +# Optional[list[tuple[int, int]]]. List of layer indices to replicate. +# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#memory-efficient-layer-replication-with-lora +peft_layer_replication: + +# bool | Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq"] +# How to initialize LoRA weights. Default to True which is MS original implementation. +# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#initialization +peft_init_lora_weights: -# LoRA+ hyperparameters -# For more details about the following options, see: -# https://arxiv.org/abs/2402.12354 and `src/axolotl/core/train_builder.py` -loraplus_lr_ratio: # loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4. -loraplus_lr_embedding: # loraplus learning rate for lora embedding layers. Default value is 1e-6. 
- -peft: - # Configuration options for loftq initialization for LoRA - # https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization - loftq_config: - loftq_bits: # typically 4 bits - -# ReLoRA configuration -# Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed -relora_steps: # Number of steps per ReLoRA restart -relora_warmup_steps: # Number of per-restart warmup steps -relora_anneal_steps: # Number of anneal steps for each relora cycle -relora_prune_ratio: # threshold for optimizer magnitude when pruning -relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings - -# wandb configuration if you're using it -# Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`. -wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb -wandb_project: # Your wandb project name -wandb_entity: # A wandb Team name if using a Team -wandb_watch: -wandb_name: # Set the name of your wandb run -wandb_run_id: # Set the ID of your wandb run -wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training - -# mlflow configuration if you're using it -mlflow_tracking_uri: # URI to mlflow -mlflow_experiment_name: # Your experiment name -mlflow_run_name: # Your run name -hf_mlflow_log_artifacts: # set to true to copy each saved checkpoint on each save to mlflow artifact registry - -# Comet configuration if you're using it -# Make sure your `COMET_API_KEY` environment variable is set (recommended) or you login to Comet with `comet login`. -# Check out our documentation for more details https://www.comet.com/docs/v2/api-and-sdk/python-sdk/reference/Experiment-Creation/#comet_ml.start -use_comet: # Enable or disable Comet integration. -comet_api_key: # API key for Comet. Recommended to set via `comet login`. 
-comet_workspace: # Workspace name in Comet. Defaults to the user's default workspace. -comet_project_name: # Project name in Comet. Defaults to Uncategorized. -comet_experiment_key: # Identifier for the experiment. Used to append data to an existing experiment or control the key of new experiments. Default to a random key. -comet_mode: # Create a new experiment ("create") or log to an existing one ("get"). Default ("get_or_create") auto-selects based on configuration. -comet_online: # Set to True to log data to Comet server, or False for offline storage. Default is True. -comet_experiment_config: # Dictionary for additional configuration settings, see the doc for more details. - -# Tensorboard -use_tensorboard: # Optional[bool] - -# Where to save the full-finetuned model to -output_dir: ./completed-model - -# Whether to use torch.compile and which backend to use -# setting to `auto` will enable torch compile when torch>=2.5.1 -torch_compile: # Optional[Union[Literal["auto"], bool]] -torch_compile_backend: # Optional[str] - -# Training hyperparameters - -# If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps. -gradient_accumulation_steps: 1 -# The number of samples to include in each batch. This is the number of samples sent to each GPU. -# Batch size per gpu = micro_batch_size * gradient_accumulation_steps -micro_batch_size: 2 -eval_batch_size: -num_epochs: 4 -warmup_steps: 100 # cannot use with warmup_ratio -warmup_ratio: 0.05 # cannot use with warmup_steps -learning_rate: 0.00003 -lr_quadratic_warmup: -logging_steps: -eval_steps: # Leave empty to eval at each epoch, integer for every N steps. float for fraction of total steps -evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps -eval_strategy: # Set to `"no"` to skip evaluation, `"epoch"` at end of each epoch, leave empty to infer from `eval_steps`. 
-save_strategy: # Set to `"no"` to skip checkpoint saves, `"epoch"` at end of each epoch, `"best"` when better result is achieved, leave empty to infer from `save_steps`. -save_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps -saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps -save_total_limit: # Checkpoints saved at a time -save_only_model: # Save only the model weights, skipping the optimizer. Using this means you can't resume from checkpoints. -# Maximum number of iterations to train for. It precedes num_epochs which means that -# if both are set, num_epochs will not be guaranteed. -# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps -max_steps: - -# bool of whether to include tokens trainer per second in the training metrics. This iterates over the entire dataset once, so it takes some time. -include_tokens_per_second: # Optional[bool] - -# whether to find batch size that fits in memory. Passed to underlying transformers Trainer -auto_find_batch_size: # Optional[bool] - -eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0 -eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128 -do_causal_lm_eval: # Whether to run causal language model evaluation for metrics in `eval_causal_lm_metrics`. -eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf", "perplexity"] - -profiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir. 
- # see https://pytorch.org/blog/understanding-gpu-memory-1/ for more information - # snapshots can be visualized @ https://pytorch.org/memory_viz - -loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training) -loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3) - -# Save model as safetensors (require safetensors package) -save_safetensors: - -# Whether to mask out or include the human's prompt from the training labels -train_on_inputs: false -# Group similarly sized data to minimize padding. -# May be slower to start, as it must download and sort the entire dataset. -# Note that training loss may have an oscillating pattern with this enabled. -group_by_length: false +# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens. +# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models. +# `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities. +# https://github.com/huggingface/peft/issues/334#issuecomment-1561727994 +lora_modules_to_save: +# - embed_tokens +# - lm_head + +lora_fan_in_fan_out: false + +# Apply custom LoRA autograd functions and activation function Triton kernels for +# speed and memory savings +# See: https://docs.axolotl.ai/docs/lora_optims.html +lora_mlp_kernel: true +lora_qkv_kernel: true +lora_o_kernel: true + +# LoRA+ hyperparameters +# For more details about the following options, see: +# https://arxiv.org/abs/2402.12354 and `src/axolotl/core/train_builder.py` +loraplus_lr_ratio: # loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4. +loraplus_lr_embedding: # loraplus learning rate for lora embedding layers. Default value is 1e-6. 
+ +peft: + # Configuration options for loftq initialization for LoRA + # https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization + loftq_config: + loftq_bits: # typically 4 bits + +# ReLoRA configuration +# Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed +relora_steps: # Number of steps per ReLoRA restart +relora_warmup_steps: # Number of per-restart warmup steps +relora_anneal_steps: # Number of anneal steps for each relora cycle +relora_prune_ratio: # threshold for optimizer magnitude when pruning +relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings + +# wandb configuration if you're using it +# Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`. +wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb +wandb_project: # Your wandb project name +wandb_entity: # A wandb Team name if using a Team +wandb_watch: +wandb_name: # Set the name of your wandb run +wandb_run_id: # Set the ID of your wandb run +wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training + +# mlflow configuration if you're using it +mlflow_tracking_uri: # URI to mlflow +mlflow_experiment_name: # Your experiment name +mlflow_run_name: # Your run name +hf_mlflow_log_artifacts: # set to true to copy each saved checkpoint on each save to mlflow artifact registry + +# Comet configuration if you're using it +# Make sure your `COMET_API_KEY` environment variable is set (recommended) or you login to Comet with `comet login`. +# Check out our documentation for more details https://www.comet.com/docs/v2/api-and-sdk/python-sdk/reference/Experiment-Creation/#comet_ml.start +use_comet: # Enable or disable Comet integration. +comet_api_key: # API key for Comet. Recommended to set via `comet login`. 
+comet_workspace: # Workspace name in Comet. Defaults to the user's default workspace. +comet_project_name: # Project name in Comet. Defaults to Uncategorized. +comet_experiment_key: # Identifier for the experiment. Used to append data to an existing experiment or control the key of new experiments. Default to a random key. +comet_mode: # Create a new experiment ("create") or log to an existing one ("get"). Default ("get_or_create") auto-selects based on configuration. +comet_online: # Set to True to log data to Comet server, or False for offline storage. Default is True. +comet_experiment_config: # Dictionary for additional configuration settings, see the doc for more details. + +# Tensorboard +use_tensorboard: # Optional[bool] + +# Where to save the full-finetuned model to +output_dir: ./completed-model + +# Whether to use torch.compile and which backend to use +# setting to `auto` will enable torch compile when torch>=2.5.1 +torch_compile: # Optional[Union[Literal["auto"], bool]] +torch_compile_backend: # Optional[str] + +# Training hyperparameters + +# If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps. +gradient_accumulation_steps: 1 +# The number of samples to include in each batch. This is the number of samples sent to each GPU. +# Batch size per gpu = micro_batch_size * gradient_accumulation_steps +micro_batch_size: 2 +eval_batch_size: +num_epochs: 4 +warmup_steps: 100 # cannot use with warmup_ratio +warmup_ratio: 0.05 # cannot use with warmup_steps +learning_rate: 0.00003 +lr_quadratic_warmup: +logging_steps: +eval_steps: # Leave empty to eval at each epoch, integer for every N steps. float for fraction of total steps +evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps +eval_strategy: # Set to `"no"` to skip evaluation, `"epoch"` at end of each epoch, leave empty to infer from `eval_steps`. 
+save_strategy: # Set to `"no"` to skip checkpoint saves, `"epoch"` at end of each epoch, `"best"` when better result is achieved, leave empty to infer from `save_steps`. +save_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps +saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps +save_total_limit: # Checkpoints saved at a time +save_only_model: # Save only the model weights, skipping the optimizer. Using this means you can't resume from checkpoints. +# Maximum number of iterations to train for. It precedes num_epochs which means that +# if both are set, num_epochs will not be guaranteed. +# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps +max_steps: + +# bool of whether to include tokens trainer per second in the training metrics. This iterates over the entire dataset once, so it takes some time. +include_tokens_per_second: # Optional[bool] + +# whether to find batch size that fits in memory. Passed to underlying transformers Trainer +auto_find_batch_size: # Optional[bool] + +eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0 +eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128 +do_causal_lm_eval: # Whether to run causal language model evaluation for metrics in `eval_causal_lm_metrics`. +eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf", "perplexity"] -# Whether to use gradient checkpointing. Available options are: true, false, "offload", "offload_disk". 
-# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing -gradient_checkpointing: false -# additional kwargs to pass to the trainer for gradient checkpointing -# gradient_checkpointing_kwargs: -# use_reentrant: true +profiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir. + # see https://pytorch.org/blog/understanding-gpu-memory-1/ for more information + # snapshots can be visualized @ https://pytorch.org/memory_viz + +loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training) +loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3) -# Stop training after this many evaluation losses have increased in a row -# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback -early_stopping_patience: 3 - -# Specify a scheduler and kwargs to use with the optimizer -lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | 'linear' | 'cosine_with_restarts' | 'polynomial' | 'constant' | 'constant_with_warmup' | 'inverse_sqrt' | 'reduce_lr_on_plateau' | 'cosine_with_min_lr' | 'warmup_stable_decay' | empty for cosine -lr_scheduler_kwargs: -cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr -cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf) +# Save model as safetensors (require safetensors package) +save_safetensors: + +# Whether to mask out or include the human's prompt from the training labels +train_on_inputs: false +# Group similarly sized data to minimize padding. +# May be slower to start, as it must download and sort the entire dataset. +# Note that training loss may have an oscillating pattern with this enabled. 
+group_by_length: false -# For one_cycle optim -lr_div_factor: # Learning rate div factor - -# Specify optimizer -# Valid values are driven by the Transformers OptimizerNames class, see: -# https://github.com/huggingface/transformers/blob/cbf924b76c03828101a34069a96d209314114fd5/src/transformers/training_args.py#L144-L189 -# -# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of -# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used -# in the examples/ for your model and fine-tuning use case. -# -# Valid values for 'optimizer' include: -# - adamw_torch -# - adamw_torch_fused -# - adamw_torch_xla -# - adamw_torch_npu_fused -# - adamw_apex_fused -# - adopt_adamw (an EXPERIMENTAL optimizer, only for torch version >= 2.5.1) -# - adafactor -# - adamw_anyprecision -# - adamw_torch_4bit -# - ademamix -# - sgd -# - adagrad -# - adamw_bnb_8bit -# - adamw_8bit # alias for adamw_bnb_8bit -# - ademamix_8bit -# - lion_8bit -# - lion_32bit -# - paged_adamw_32bit -# - paged_adamw_8bit -# - paged_ademamix_32bit -# - paged_ademamix_8bit -# - paged_lion_32bit -# - paged_lion_8bit -# - rmsprop -# - rmsprop_bnb -# - rmsprop_bnb_8bit -# - rmsprop_bnb_32bit -# - galore_adamw -# - galore_adamw_8bit -# - galore_adafactor -# - galore_adamw_layerwise -# - galore_adamw_8bit_layerwise -# - galore_adafactor_layerwise -# - lomo -# - adalomo -# - grokadamw -# - schedule_free_adamw -# - schedule_free_sgd -# - apollo_adamw -# - apollo_adamw_layerwise -# -# Additional custom optimizers include: -# - optimi_adamw -# - ao_adamw_8bit -# - ao_adamw_fp8 -# - came_pytorch -optimizer: -# Dictionary of arguments to pass to the optimizer -optim_args: -# For Galore Optimizers the following optim_args are available -# rank: # type: int -# update_proj_gap # type: int -# scale # type: float -# proj_type: # type: str, default = std - -# The target modules to optimize, i.e. 
the module names that you would like to train, right now this is used only for GaLore algorithm -optim_target_modules: -# - self_attn # for llama -# - mlp - -# Specify weight decay -weight_decay: -# adamw hyperparams -adam_beta1: -adam_beta2: -adam_beta3: # only used for CAME Optimizer -adam_epsilon: -adam_epsilon2: # only used for CAME Optimizer -# Gradient clipping max norm -max_grad_norm: - -# Augmentation techniques -# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings -# currently only supported on Llama and Mistral -neftune_noise_alpha: - -# Optional[bool]. Whether to bettertransformers -flash_optimum: - -# Note: Only one of the following attention patches can be used at a time. -# For example, if you set `xformers_attention` to `true`, do not set `flash_attention` to `true`. - -# Optional[bool]. Whether to use xformers attention patch https://github.com/facebookresearch/xformers: -xformers_attention: -# Optional[bool]. Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention: -flash_attention: -flash_attn_cross_entropy: # Optional[bool]. Whether to use flash-attention cross entropy implementation - advanced use only -flash_attn_rms_norm: # Optional[bool]. Whether to use flash-attention rms norm implementation - advanced use only -flash_attn_fuse_qkv: # Optional[bool]. Whether to fuse QKV into a single operation -flash_attn_fuse_mlp: # Optional[bool]. Whether to fuse part of the MLP into a single operation -# Optional[bool]. Whether to use scaled-dot-product attention -# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html -sdp_attention: -# Optional[bool]. Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf -s2_attention: - -# Optional[bool]. Whether to use low_cpu_mem_usage -low_cpu_mem_usage: -# Optional[str]. Resume from a specific checkpoint dir -resume_from_checkpoint: -# Optional[bool]. 
If resume_from_checkpoint isn't set and you simply want it to start where it left off. -# Be careful with this being turned on between different models. -auto_resume_from_checkpoints: false - -## Multimodal section -# int | tuple[int, int] | None . Size to resize images to, width x height. -# Will read from model/processor config if not set. -image_size: -# str. Algorithm to use for image resizing. "bilinear", "bicubic", "lanczos". Default is "bilinear". -image_resize_algorithm: 'bilinear' -## End of multimodal section - -# Don't mess with this, it's here for accelerate and torchrun -local_rank: - -# Add or change special tokens. -# If you add tokens here, you don't need to add them to the `tokens` list. -special_tokens: - # bos_token: "<s>" - # eos_token: "</s>" - # unk_token: "<unk>" - # pad_token: "[PAD]" - -# Optional[list[str]]. Add extra tokens to the tokenizer. -tokens: - # - "<|startoftext|>" - # - "<|endoftext|>" - -# Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer. -# Only works for tokens that are not part of the base vocab (aka are added_tokens). -# Can be checked if they exist in tokenizer.json added_tokens. -added_tokens_overrides: # Dict[int, str] -# 128041: "<|im_start|>" -# 128042: "<|im_end|>" - -# FSDP -fsdp: -fsdp_config: - -# Deepspeed config path. e.g., deepspeed_configs/zero3.json -deepspeed: - -# Advanced DDP Arguments -ddp_timeout: -ddp_bucket_cap_mb: -ddp_broadcast_buffers: - -# Sequence parallelism -# Set to a divisor of the number of GPUs available to split sequences into chunks of equal size. -# Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM. -# E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized -# subsequences, or set to 4 to split into four equal-sized subsequences. -# See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details. 
-sequence_parallel_degree: -# Optional; strides across the key dimension. Larger values use more memory but should make training faster. -# Must evenly divide the number of KV heads in your model. -heads_k_stride: 1 -# One of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to "varlen_llama3" -# in the sample packing case, and "batch_ring" in the non-sample packing case. -ring_attn_func: - -# Path to torch distx for optim 'adamw_anyprecision' -torchdistx_path: - -# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize -pretraining_dataset: - -# Debug mode -debug: - -# Seed -seed: +# Whether to use gradient checkpointing. Available options are: true, false, "offload", "offload_disk". +# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing +gradient_checkpointing: false +# additional kwargs to pass to the trainer for gradient checkpointing +# gradient_checkpointing_kwargs: +# use_reentrant: true + +# Stop training after this many evaluation losses have increased in a row +# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback +early_stopping_patience: 3 + +# Specify a scheduler and kwargs to use with the optimizer +# Valid values are driven by the Transformers SchedulerType class, see: +# https://github.com/huggingface/transformers/blob/5f4ecf2d9f867a1255131d2461d75793c0cf1db2/src/transformers/trainer_utils.py#L420 +# Valid values include +# - 'linear' +# - 'cosine' (default) +# - 'cosine_with_restarts' +# - 'polynomial' +# - 'constant' +# - 'constant_with_warmup' +# - 'inverse_sqrt' +# - 'reduce_lr_on_plateau' +# - 'cosine_with_min_lr' +# - 'warmup_stable_decay' + +# Additional schedulers include: +# - 'one_cycle' +# - 'rex' +lr_scheduler: +lr_scheduler_kwargs: +cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. 
cosine_min_lr_ratio=0.1 for 10% of peak lr +cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf) + +# For one_cycle optim +lr_div_factor: # Learning rate div factor + +# Specify optimizer +# Valid values are driven by the Transformers OptimizerNames class, see: +# https://github.com/huggingface/transformers/blob/cbf924b76c03828101a34069a96d209314114fd5/src/transformers/training_args.py#L144-L189 +# +# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of +# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used +# in the examples/ for your model and fine-tuning use case. +# +# Valid values for 'optimizer' include: +# - adamw_torch +# - adamw_torch_fused (default) +# - adamw_torch_xla +# - adamw_torch_npu_fused +# - adamw_apex_fused +# - adopt_adamw (an EXPERIMENTAL optimizer, only for torch version >= 2.5.1) +# - adafactor +# - adamw_anyprecision +# - adamw_torch_4bit +# - ademamix +# - sgd +# - adagrad +# - adamw_bnb_8bit +# - adamw_8bit # alias for adamw_bnb_8bit +# - ademamix_8bit +# - lion_8bit +# - lion_32bit +# - paged_adamw_32bit +# - paged_adamw_8bit +# - paged_ademamix_32bit +# - paged_ademamix_8bit +# - paged_lion_32bit +# - paged_lion_8bit +# - rmsprop +# - rmsprop_bnb +# - rmsprop_bnb_8bit +# - rmsprop_bnb_32bit +# - galore_adamw +# - galore_adamw_8bit +# - galore_adafactor +# - galore_adamw_layerwise +# - galore_adamw_8bit_layerwise +# - galore_adafactor_layerwise +# - lomo +# - adalomo +# - grokadamw +# - schedule_free_adamw +# - schedule_free_sgd +# - apollo_adamw +# - apollo_adamw_layerwise +# +# Additional custom optimizers include: +# - optimi_adamw +# - ao_adamw_8bit +# - ao_adamw_fp8 +# - came_pytorch +optimizer: +# Dictionary of arguments to pass to the optimizer +optim_args: +# For 
Galore Optimizers the following optim_args are available +# rank: # type: int +# update_proj_gap # type: int +# scale # type: float +# proj_type: # type: str, default = std + +# The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm +optim_target_modules: +# - self_attn # for llama +# - mlp + +# Specify weight decay +weight_decay: +# adamw hyperparams +adam_beta1: +adam_beta2: +adam_beta3: # only used for CAME Optimizer +adam_epsilon: +adam_epsilon2: # only used for CAME Optimizer +# Gradient clipping max norm +max_grad_norm: + +# Augmentation techniques +# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings +# currently only supported on Llama and Mistral +neftune_noise_alpha: + +# Optional[bool]. Whether to bettertransformers +flash_optimum: + +# Note: Only one of the following attention patches can be used at a time. +# For example, if you set `xformers_attention` to `true`, do not set `flash_attention` to `true`. + +# Optional[bool]. Whether to use xformers attention patch https://github.com/facebookresearch/xformers: +xformers_attention: +# Optional[bool]. Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention: +flash_attention: +flash_attn_cross_entropy: # Optional[bool]. Whether to use flash-attention cross entropy implementation - advanced use only +flash_attn_rms_norm: # Optional[bool]. Whether to use flash-attention rms norm implementation - advanced use only +flash_attn_fuse_qkv: # Optional[bool]. Whether to fuse QKV into a single operation +flash_attn_fuse_mlp: # Optional[bool]. Whether to fuse part of the MLP into a single operation +# Optional[bool]. Whether to use scaled-dot-product attention +# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html +sdp_attention: +# Optional[bool]. 
Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf +s2_attention: + +# Optional[bool]. Whether to use low_cpu_mem_usage +low_cpu_mem_usage: +# Optional[str]. Resume from a specific checkpoint dir +resume_from_checkpoint: +# Optional[bool]. If resume_from_checkpoint isn't set and you simply want it to start where it left off. +# Be careful with this being turned on between different models. +auto_resume_from_checkpoints: false + +## Multimodal section +# int | tuple[int, int] | None . Size to resize images to, width x height. +# Will read from model/processor config if not set. +image_size: +# str. Algorithm to use for image resizing. "bilinear", "bicubic", "lanczos". Default is "bilinear". +image_resize_algorithm: 'bilinear' +## End of multimodal section + +# Don't mess with this, it's here for accelerate and torchrun +local_rank: + +# Add or change special tokens. +# If you add tokens here, you don't need to add them to the `tokens` list. +special_tokens: + # bos_token: "<s>" + # eos_token: "</s>" + # unk_token: "<unk>" + # pad_token: "[PAD]" + +# Optional[list[str]]. Add extra tokens to the tokenizer. +tokens: + # - "<|startoftext|>" + # - "<|endoftext|>" + +# Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer. +# Only works for tokens that are not part of the base vocab (aka are added_tokens). +# Can be checked if they exist in tokenizer.json added_tokens. +added_tokens_overrides: # Dict[int, str] +# 128041: "<|im_start|>" +# 128042: "<|im_end|>" + +# FSDP +fsdp: +fsdp_config: -# Allow overwrite yml config using from cli -strict: +# Deepspeed config path. e.g., deepspeed_configs/zero3.json +deepspeed: + +# Advanced DDP Arguments +ddp_timeout: +ddp_bucket_cap_mb: +ddp_broadcast_buffers: + +# Sequence parallelism +# Set to a divisor of the number of GPUs available to split sequences into chunks of equal size. 
+# Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM. +# E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized +# subsequences, or set to 4 to split into four equal-sized subsequences. +# See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details. +sequence_parallel_degree: +# Optional; strides across the key dimension. Larger values use more memory but should make training faster. +# Must evenly divide the number of KV heads in your model. +heads_k_stride: 1 +# One of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to "varlen_llama3" +# in the sample packing case, and "batch_ring" in the non-sample packing case. +ring_attn_func: + +# Path to torch distx for optim 'adamw_anyprecision' +torchdistx_path: + +# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize +pretraining_dataset: + +# Debug mode +debug: + +# Seed +seed: + +# Allow overwrite yml config using from cli +strict: diff --git a/docs/custom_integrations.html b/docs/custom_integrations.html index 08f78102b..821d515ad 100644 --- a/docs/custom_integrations.html +++ b/docs/custom_integrations.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/dataset-formats/conversation.html b/docs/dataset-formats/conversation.html index 8e8daa53b..94ef00ea9 100644 --- a/docs/dataset-formats/conversation.html +++ b/docs/dataset-formats/conversation.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/dataset-formats/index.html b/docs/dataset-formats/index.html index fe4c6bb5f..0dd97607a 100644 --- a/docs/dataset-formats/index.html +++ b/docs/dataset-formats/index.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: 
underlin "search-label": "Search" } } + + + @@ -538,19 +547,6 @@ Tip ...

It is typically recommended to save your dataset as .jsonl due to its flexibility and simplicity.

Axolotl supports loading from a Hugging Face hub repo or from local files.

-
-
-
- -
-
-Important -
-
-
-

For pre-training only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts.

-
-

Pre-training from Hugging Face hub datasets

As an example, to train using a Hugging Face dataset hf_org/name, you can pass the following config:

@@ -575,14 +571,26 @@ Important
datasets:
   - path: hf_org/name
     type: completion
-

From local files (either example works):

+

From local files:

datasets:
   - path: A.jsonl
     type: completion
 
-  - path: json
-    data_files: ["A.jsonl", "B.jsonl", "C.jsonl"]
-    type: completion
+ - path: B.jsonl + type: completion +
+
+
+ +
+
+Important +
+
+
+

For completion datasets only, Axolotl splits any text that exceeds the context length into multiple smaller prompts. If you are interested in having this for pretraining_dataset too, please let us know or help make a PR!

+
+

Pre-training dataset configuration tips

diff --git a/docs/dataset-formats/inst_tune.html b/docs/dataset-formats/inst_tune.html index 9fbd65049..52eb888a0 100644 --- a/docs/dataset-formats/inst_tune.html +++ b/docs/dataset-formats/inst_tune.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/dataset-formats/pretraining.html b/docs/dataset-formats/pretraining.html index 0cdd1ef0d..70ff38c24 100644 --- a/docs/dataset-formats/pretraining.html +++ b/docs/dataset-formats/pretraining.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/dataset-formats/stepwise_supervised.html b/docs/dataset-formats/stepwise_supervised.html index 130758c08..7573bef9b 100644 --- a/docs/dataset-formats/stepwise_supervised.html +++ b/docs/dataset-formats/stepwise_supervised.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/dataset-formats/template_free.html b/docs/dataset-formats/template_free.html index d08604ab6..e1ef554cc 100644 --- a/docs/dataset-formats/template_free.html +++ b/docs/dataset-formats/template_free.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/dataset-formats/tokenized.html b/docs/dataset-formats/tokenized.html index ce5a8b69a..1725ef115 100644 --- a/docs/dataset-formats/tokenized.html +++ b/docs/dataset-formats/tokenized.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/dataset_loading.html b/docs/dataset_loading.html index e778ebcb5..905879770 100644 --- a/docs/dataset_loading.html +++ b/docs/dataset_loading.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > 
a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + @@ -559,19 +568,15 @@ Note

Local dataset

Files

-

Usually, to load a JSON file, you would do something like this:

+

To load a JSON file, you would do something like this:

from datasets import load_dataset
 
 dataset = load_dataset("json", data_files="data.json")

Which translates to the following config:

datasets:
-  - path: json
-    data_files: /path/to/your/file.jsonl
-

However, to make things easier, we have added a few shortcuts for loading local dataset files.

-

You can just point the path to the file or directory along with the ds_type to load the dataset. The below example shows for a JSON file:

-
datasets:
-  - path: /path/to/your/file.jsonl
-    ds_type: json
+ - path: data.json + ds_type: json +

As the example above shows, you can simply point path at the file or directory and set ds_type to load the dataset.

This works for CSV, JSON, Parquet, and Arrow files.

@@ -597,31 +602,31 @@ Tip

We will attempt to load in the following order: (1) datasets saved with datasets.save_to_disk; (2) the entire directory of files (such as parquet/arrow files).

-
datasets:
-  - path: /path/to/your/directory
+
datasets:
+  - path: /path/to/your/directory
Loading specific files in directory

Provide data_files with a list of files to load.

-
datasets:
-    # single file
-  - path: /path/to/your/directory
-    ds_type: csv
-    data_files: file1.csv
-
-    # multiple files
-  - path: /path/to/your/directory
-    ds_type: json
-    data_files:
-      - file1.jsonl
-      - file2.jsonl
-
-    # multiple files for parquet
-  - path: /path/to/your/directory
-    ds_type: parquet
-    data_files:
-      - file1.parquet
-      - file2.parquet
+
datasets:
+    # single file
+  - path: /path/to/your/directory
+    ds_type: csv
+    data_files: file1.csv
+
+    # multiple files
+  - path: /path/to/your/directory
+    ds_type: json
+    data_files:
+      - file1.jsonl
+      - file2.jsonl
+
+    # multiple files for parquet
+  - path: /path/to/your/directory
+    ds_type: parquet
+    data_files:
+      - file1.parquet
+      - file2.parquet
@@ -644,17 +649,17 @@ Note

Folder uploaded

This means that the dataset consists of one or more files uploaded to the Hub.

-
datasets:
-  - path: org/dataset-name
-    data_files:
-      - file1.jsonl
-      - file2.jsonl
+
datasets:
+  - path: org/dataset-name
+    data_files:
+      - file1.jsonl
+      - file2.jsonl

HuggingFace Dataset

This means that the dataset is created as a HuggingFace Dataset and pushed to the Hub via datasets.push_to_hub.

-
datasets:
-  - path: org/dataset-name
+
datasets:
+  - path: org/dataset-name
@@ -687,12 +692,12 @@ Warning

The only difference between the providers is that you need to prepend the path with the respective protocols.

-
datasets:
-    # Single file
-  - path: s3://bucket-name/path/to/your/file.jsonl
-
-    # Directory
-  - path: s3://bucket-name/path/to/your/directory
+
datasets:
+    # Single file
+  - path: s3://bucket-name/path/to/your/file.jsonl
+
+    # Directory
+  - path: s3://bucket-name/path/to/your/directory

For a directory, we load via load_from_disk.

S3

@@ -769,8 +774,8 @@ Note

HTTPS

The path should start with https://.

-
datasets:
-  - path: https://path/to/your/dataset/file.jsonl
+
datasets:
+  - path: https://path/to/your/dataset/file.jsonl

This must be publicly accessible.

diff --git a/docs/dataset_preprocessing.html b/docs/dataset_preprocessing.html index 6c84b5813..e395bdcc2 100644 --- a/docs/dataset_preprocessing.html +++ b/docs/dataset_preprocessing.html @@ -68,6 +68,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/debugging.html b/docs/debugging.html index a88689b85..ca00b99c0 100644 --- a/docs/debugging.html +++ b/docs/debugging.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/docker.html b/docs/docker.html index fc8f7438f..11570e575 100644 --- a/docs/docker.html +++ b/docs/docker.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/faq.html b/docs/faq.html index 02f8337a1..ca5dad691 100644 --- a/docs/faq.html +++ b/docs/faq.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/fsdp_qlora.html b/docs/fsdp_qlora.html index e45b081f9..41a89c5e4 100644 --- a/docs/fsdp_qlora.html +++ b/docs/fsdp_qlora.html @@ -68,6 +68,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/getting-started.html b/docs/getting-started.html index c5883fb44..ca9ede315 100644 --- a/docs/getting-started.html +++ b/docs/getting-started.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/inference.html b/docs/inference.html index 3a97df337..ab381108a 100644 --- a/docs/inference.html +++ b/docs/inference.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/input_output.html b/docs/input_output.html index e62c96320..408eb0e5e 100644 
--- a/docs/input_output.html +++ b/docs/input_output.html @@ -68,6 +68,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/installation.html b/docs/installation.html index e635f1aed..092c94089 100644 --- a/docs/installation.html +++ b/docs/installation.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/lora_optims.html b/docs/lora_optims.html index d6a8ff7e3..582b154f2 100644 --- a/docs/lora_optims.html +++ b/docs/lora_optims.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + @@ -559,6 +568,19 @@ projection, respectively.

lora_mlp_kernel: true
 lora_qkv_kernel: true
 lora_o_kernel: true
+
+
+
+ +
+
+Note +
+
+
+

Currently, LoRA kernels are not supported for RLHF training, only SFT.

+
+

Requirements

diff --git a/docs/lr_groups.html b/docs/lr_groups.html index 612f5e5a8..b7426d054 100644 --- a/docs/lr_groups.html +++ b/docs/lr_groups.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/mac.html b/docs/mac.html index 809013f45..370dec7b6 100644 --- a/docs/mac.html +++ b/docs/mac.html @@ -68,6 +68,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/multi-gpu.html b/docs/multi-gpu.html index 277313b69..e0d9d87ed 100644 --- a/docs/multi-gpu.html +++ b/docs/multi-gpu.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/multi-node.html b/docs/multi-node.html index acf953ca8..9b923439f 100644 --- a/docs/multi-node.html +++ b/docs/multi-node.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/multimodal.html b/docs/multimodal.html index bb95fe2c4..7cd11f3e3 100644 --- a/docs/multimodal.html +++ b/docs/multimodal.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/multipack.html b/docs/multipack.html index 1baae29b0..4cf8eb457 100644 --- a/docs/multipack.html +++ b/docs/multipack.html @@ -68,6 +68,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git a/docs/nccl.html b/docs/nccl.html index af923b1af..e939c6f2a 100644 --- a/docs/nccl.html +++ b/docs/nccl.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/ray-integration.html b/docs/ray-integration.html index 67d50cdde..ba101d852 100644 --- a/docs/ray-integration.html +++ b/docs/ray-integration.html @@ 
-103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/reward_modelling.html b/docs/reward_modelling.html index 509c196b5..5fcc437ff 100644 --- a/docs/reward_modelling.html +++ b/docs/reward_modelling.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/rlhf.html b/docs/rlhf.html index c9b601f5a..373f100b1 100644 --- a/docs/rlhf.html +++ b/docs/rlhf.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + @@ -490,6 +499,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
  • GRPO
  • SimPO
  • Using local dataset files
  • @@ -534,7 +544,7 @@ feedback. Various methods include, but not limited to:

  • Identity Preference Optimization (IPO)
  • Kahneman-Tversky Optimization (KTO)
  • Odds Ratio Preference Optimization (ORPO)
  • -
  • Proximal Policy Optimization (PPO) (not yet supported in axolotl)
  • +
  • Proximal Policy Optimization (PPO) (not yet supported in axolotl, if you’re interested in contributing, please reach out!)
  • @@ -1042,32 +1052,41 @@ Note name: main type: rewards.oai_gsm8k_transform # format: '{file_name}.{fn_name}'

    To see other examples of custom reward functions, please see TRL GRPO Docs.

    -

    To see description of the configs, please see TRLConfig.

    +

    To see all configs, please see TRLConfig.

    +
    +
    +

    GRPO with DAPO/Dr. GRPO loss

    +

    The DAPO paper, and subsequently the Dr. GRPO paper, proposed an alternative loss function for GRPO to remediate the penalty on longer responses.

    +
    trl:
    +  loss_type: dr_grpo
    +  # Normalizes loss based on max completion length (default: 256)
    +  max_completion_length:
    +

    For more information, see GRPO docs.

    SimPO

    SimPO uses CPOTrainer but with an alternative loss function.

    -
    rl: simpo
    -rl_beta: 0.1  # default in CPOTrainer
    -cpo_alpha: 1.0  # default in CPOTrainer
    -simpo_gamma: 0.5  # default in CPOTrainer
    +
    rl: simpo
    +rl_beta: 0.1  # default in CPOTrainer
    +cpo_alpha: 1.0  # default in CPOTrainer
    +simpo_gamma: 0.5  # default in CPOTrainer

    This method uses the same dataset format as DPO.

    Using local dataset files

    -
    datasets:
    -  - ds_type: json
    -    data_files:
    -      - orca_rlhf.jsonl
    -    split: train
    -    type: chatml.intel
    +
    datasets:
    +  - ds_type: json
    +    data_files:
    +      - orca_rlhf.jsonl
    +    split: train
    +    type: chatml.intel

    TRL auto-unwrapping for PEFT

    TRL supports auto-unwrapping PEFT models for RL training paradigms which rely on a reference model. This significantly reduces memory pressure as an additional reference model does not need to be loaded, and reference model log-probabilities can be obtained by disabling PEFT adapters. This is enabled by default. To turn it off, pass the following config:

    -
    # load ref model when adapter training.
    -rl_adapter_ref_model: true
    +
    # load ref model when adapter training.
    +rl_adapter_ref_model: true
    diff --git a/docs/sequence_parallelism.html b/docs/sequence_parallelism.html index d7d35eb71..d4037c6dd 100644 --- a/docs/sequence_parallelism.html +++ b/docs/sequence_parallelism.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/torchao.html b/docs/torchao.html index e68040bfd..0b6b8e356 100644 --- a/docs/torchao.html +++ b/docs/torchao.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/docs/unsloth.html b/docs/unsloth.html index 0807b02da..3c91d114a 100644 --- a/docs/unsloth.html +++ b/docs/unsloth.html @@ -103,6 +103,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/examples/colab-notebooks/colab-axolotl-example.html b/examples/colab-notebooks/colab-axolotl-example.html index 661496cb5..34219bb15 100644 --- a/examples/colab-notebooks/colab-axolotl-example.html +++ b/examples/colab-notebooks/colab-axolotl-example.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/index.html b/index.html index ecdbcacdb..c226fe22a 100644 --- a/index.html +++ b/index.html @@ -102,6 +102,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin "search-label": "Search" } } + + + diff --git a/search.json b/search.json index 62590159f..7b101bd5e 100644 --- a/search.json +++ b/search.json @@ -84,7 +84,7 @@ "href": "docs/rlhf.html", "title": "RLHF (Beta)", "section": "", - "text": "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human\nfeedback. 
Various methods include, but not limited to:\n\nDirect Preference Optimization (DPO)\nIdentity Preference Optimization (IPO)\nKahneman-Tversky Optimization (KTO)\nOdds Ratio Preference Optimization (ORPO)\nProximal Policy Optimization (PPO) (not yet supported in axolotl)", + "text": "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human\nfeedback. Various methods include, but not limited to:\n\nDirect Preference Optimization (DPO)\nIdentity Preference Optimization (IPO)\nKahneman-Tversky Optimization (KTO)\nOdds Ratio Preference Optimization (ORPO)\nProximal Policy Optimization (PPO) (not yet supported in axolotl, if you’re interested in contributing, please reach out!)", "crumbs": [ "How To Guides", "RLHF (Beta)" @@ -95,7 +95,7 @@ "href": "docs/rlhf.html#overview", "title": "RLHF (Beta)", "section": "", - "text": "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human\nfeedback. Various methods include, but not limited to:\n\nDirect Preference Optimization (DPO)\nIdentity Preference Optimization (IPO)\nKahneman-Tversky Optimization (KTO)\nOdds Ratio Preference Optimization (ORPO)\nProximal Policy Optimization (PPO) (not yet supported in axolotl)", + "text": "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human\nfeedback. 
Various methods include, but not limited to:\n\nDirect Preference Optimization (DPO)\nIdentity Preference Optimization (IPO)\nKahneman-Tversky Optimization (KTO)\nOdds Ratio Preference Optimization (ORPO)\nProximal Policy Optimization (PPO) (not yet supported in axolotl, if you’re interested in contributing, please reach out!)", "crumbs": [ "How To Guides", "RLHF (Beta)" @@ -106,7 +106,7 @@ "href": "docs/rlhf.html#rlhf-using-axolotl", "title": "RLHF (Beta)", "section": "RLHF using Axolotl", - "text": "RLHF using Axolotl\n\n\n\n\n\n\nImportant\n\n\n\nThis is a BETA feature and many features are not fully implemented. You are encouraged to open new PRs to improve the integration and functionality.\n\n\nWe rely on the TRL library for implementations of various RL training methods, which we wrap around to expose in axolotl. Each method has their own supported ways of loading datasets and prompt formats.\n\n\n\n\n\n\nTip\n\n\n\nYou can find what each method supports by going into src/axolotl/prompt_strategies/{method} where {method} is one of our supported methods. 
The type: can be retrieved from {method}.{function_name}.\n\n\n\nDPO\nExample config:\nrl: dpo\ndatasets:\n - path: Intel/orca_dpo_pairs\n split: train\n type: chatml.intel\n - path: argilla/ultrafeedback-binarized-preferences\n split: train\n type: chatml\nDPO supports the following types with the following dataset format:\n\nchatml.argilla\n{\n \"system\": \"...\", // optional\n \"instruction\": \"...\",\n \"chosen_response\": \"...\",\n \"rejected_response\": \"...\"\n}\n\n\nchatml.argilla_chat\n{\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nchatml.icr\n{\n \"system\": \"...\", // optional\n \"input\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nchatml.intel\n{\n \"system\": \"...\", // optional\n \"question\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nchatml.prompt_pairs\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nchatml.ultra\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nllama3.argilla\n{\n \"system\": \"...\", // optional\n \"instruction\": \"...\",\n \"chosen_response\": \"...\",\n \"rejected_response\": \"...\"\n}\n\n\nllama3.argilla_chat\n{\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nllama3.icr\n{\n \"system\": \"...\", // optional\n \"input\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": 
\"...\"\n}\n\n\nllama3.intel\n{\n \"system\": \"...\", // optional\n \"question\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nllama3.prompt_pairs\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nllama3.ultra\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nzephyr.nectar\n{\n \"prompt\": \"...\",\n \"answers\": [\n {\n \"answer\": \"...\",\n \"rank\": 1\n },\n {\n \"answer\": \"...\",\n \"rank\": 2\n }\n // ... more answers with ranks\n ]\n}\n\n\nchat_template.default\nrl: dpo\ndatasets:\n - path: ...\n split: train\n type: chat_template.default\n field_messages: \"messages\"\n field_chosen: \"chosen\"\n field_rejected: \"rejected\"\n message_property_mappings:\n role: role\n content: content\n roles:\n user: [\"user\"]\n assistant: [\"assistant\"]\n system: [\"system\"]\nSample input format:\n{\n \"messages\": [\n {\n \"role\": \"system\",\n \"content\": \"...\"\n },\n {\n \"role\": \"user\",\n \"content\": \"...\"\n },\n // ... 
more messages\n ],\n \"chosen\": {\n \"role\": \"assistant\",\n \"content\": \"...\"\n },\n \"rejected\": {\n \"role\": \"assistant\",\n \"content\": \"...\"\n }\n}\n\n\nuser_defined.default\nFor custom behaviors,\nrl: dpo\ndatasets:\n - path: ...\n split: train\n type: user_defined.default\n\n field_prompt: \"prompt\"\n field_system: \"system\"\n field_chosen: \"chosen\"\n field_rejected: \"rejected\"\n prompt_format: \"{prompt}\"\n chosen_format: \"{chosen}\"\n rejected_format: \"{rejected}\"\nThe input format is a simple JSON input with customizable fields based on the above config.\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\n\nIPO\nAs IPO is just DPO with a different loss function, all supported dataset formats for DPO are also supported for IPO.\nrl: ipo\n\n\nORPO\nPaper: https://arxiv.org/abs/2403.07691\nrl: orpo\norpo_alpha: 0.1\nremove_unused_columns: false\n\nchat_template: chatml\ndatasets:\n - path: argilla/ultrafeedback-binarized-preferences-cleaned\n type: chat_template.argilla\nORPO supports the following types with the following dataset format:\n\nchat_template.argilla\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\", // if available, will be taken as user message for single-turn instead of from list below\n\n // chosen/rejected should be same till last content and only even-number of alternating user/assistant turns\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\n\nKTO\nrl: kto\nrl_beta: 0.1 # default\nkto_desirable_weight: 1.0 # default\nkto_undesirable_weight: 1.0 # default\n\nremove_unused_columns: false\n\ndatasets:\n - path: argilla/ultrafeedback-binarized-preferences-cleaned-kto\n type: llama3.ultra\n split: train\n\ngradient_checkpointing: 
true\ngradient_checkpointing_kwargs:\n use_reentrant: true\nKTO supports the following types with the following dataset format:\n\nchatml.argilla\n{\n \"system\": \"...\", // optional\n \"instruction\": \"...\",\n \"completion\": \"...\"\n}\n\n\nchatml.argilla_chat\n{\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"}\n ],\n \"completion\": [\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nchatml.intel\n{\n \"system\": \"...\", // optional\n \"question\": \"...\",\n \"completion\": \"...\"\n}\n\n\nchatml.prompt_pairs\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\"\n}\n\n\nchatml.ultra\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\"\n}\n\n\nllama3.argilla\n{\n \"system\": \"...\", // optional\n \"instruction\": \"...\",\n \"completion\": \"...\"\n}\n\n\nllama3.argilla_chat\n{\n \"completion\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nllama3.intel\n{\n \"system\": \"...\", // optional\n \"question\": \"...\",\n \"completion\": \"...\"\n}\n\n\nllama3.prompt_pairs\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\"\n}\n\n\nllama3.ultra\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\"\n}\n\n\nuser_defined.default\nFor custom behaviors,\nrl: kto\ndatasets:\n - path: ...\n split: train\n type: user_defined.default\n\n field_prompt: \"prompt\"\n field_system: \"system\"\n field_completion: \"completion\"\n field_label: \"label\"\n prompt_format: \"{prompt}\"\n completion_format: \"{completion}\"\nThe input format is a simple JSON input with customizable fields based on the above config.\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\",\n \"label\": \"...\"\n}\n\n\n\nGRPO\n\n\n\n\n\n\nTip\n\n\n\nCheck out our GRPO cookbook.\n\n\nIn the latest GRPO implementation, vLLM is used to significantly 
speedup trajectory generation during training. In this example, we’re using 4 GPUs - 2 for training, and 2 for vLLM:\n\n\n\n\n\n\nImportant\n\n\n\nMake sure you’ve installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. pip install axolotl[vllm].\n\n\nbase_model: Qwen/Qwen2.5-1.5B-Instruct\n\nvllm:\n host: 0.0.0.0\n port: 8000\n tensor_parallel_size: 2\n gpu_memory_utilization: 0.85\n dtype: auto\n # max_model_len: # you may find it useful to set the vLLM model context length if you know this beforehand\n\nrl: grpo\ntrl:\n use_vllm: true\n vllm_server_host: 0.0.0.0\n vllm_server_port: 8000\n vllm_server_timeout: 300\nCUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml\nYour vLLM instance will now attempt to spin up, and it’s time to kick off training utilizing our remaining two GPUs. In another terminal, execute:\nCUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2\n\n\n\n\n\n\nNote\n\n\n\nDue to TRL’s implementation with vLLM, the vLLM instance must use the last N GPUs instead of the first N GPUs. This is why in the example above, we use CUDA_VISIBLE_DEVICES=2,3 for the vLLM instance.\n\n\n\nReward functions\nGRPO uses custom reward functions and transformations. 
Please have them ready locally.\nFor example, to load OpenAI’s GSM8K and use a random reward for completions:\n# rewards.py\nimport random\n\ndef rand_reward_func(completions, **kwargs) -> list[float]:\n return [random.uniform(0, 1) for _ in completions]\n\ndef oai_gsm8k_transform(cfg, *args, **kwargs):\n def transform_fn(example, tokenizer=None):\n label = example[\"answer\"].split(\"####\")[-1].strip().replace(\",\", \"\")\n return {\n \"prompt\": [{\"role\": \"user\", \"content\": example[\"question\"]},],\n \"answer\": label,\n }\n return transform_fn, {\"remove_columns\": [\"question\"]}\nrl: grpo\n\ntrl:\n beta: 0.001\n max_completion_length: 256\n use_vllm: True\n num_generations: 4\n reward_funcs: [\"rewards.rand_reward_func\"] # format: '{file_name}.{fn_name}'\n reward_weights: [1.0]\ndatasets:\n - path: openai/gsm8k\n name: main\n type: rewards.oai_gsm8k_transform # format: '{file_name}.{fn_name}'\nTo see other examples of custom reward functions, please see TRL GRPO Docs.\nTo see description of the configs, please see TRLConfig.\n\n\n\nSimPO\nSimPO uses CPOTrainer but with alternative loss function.\nrl: simpo\nrl_beta: 0.1 # default in CPOTrainer\ncpo_alpha: 1.0 # default in CPOTrainer\nsimpo_gamma: 0.5 # default in CPOTrainer\nThis method uses the same dataset format as DPO.\n\n\nUsing local dataset files\ndatasets:\n - ds_type: json\n data_files:\n - orca_rlhf.jsonl\n split: train\n type: chatml.intel\n\n\nTRL auto-unwrapping for PEFT\nTRL supports auto-unwrapping PEFT models for RL training paradigms which rely on a reference model. This significantly reduces memory pressure as an additional refreference model does not need to be loaded, and reference model log-probabilities can be obtained by disabling PEFT adapters. This is enabled by default. 
To turn it off, pass the following config:\n# load ref model when adapter training.\nrl_adapter_ref_model: true", + "text": "RLHF using Axolotl\n\n\n\n\n\n\nImportant\n\n\n\nThis is a BETA feature and many features are not fully implemented. You are encouraged to open new PRs to improve the integration and functionality.\n\n\nWe rely on the TRL library for implementations of various RL training methods, which we wrap around to expose in axolotl. Each method has their own supported ways of loading datasets and prompt formats.\n\n\n\n\n\n\nTip\n\n\n\nYou can find what each method supports by going into src/axolotl/prompt_strategies/{method} where {method} is one of our supported methods. The type: can be retrieved from {method}.{function_name}.\n\n\n\nDPO\nExample config:\nrl: dpo\ndatasets:\n - path: Intel/orca_dpo_pairs\n split: train\n type: chatml.intel\n - path: argilla/ultrafeedback-binarized-preferences\n split: train\n type: chatml\nDPO supports the following types with the following dataset format:\n\nchatml.argilla\n{\n \"system\": \"...\", // optional\n \"instruction\": \"...\",\n \"chosen_response\": \"...\",\n \"rejected_response\": \"...\"\n}\n\n\nchatml.argilla_chat\n{\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nchatml.icr\n{\n \"system\": \"...\", // optional\n \"input\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nchatml.intel\n{\n \"system\": \"...\", // optional\n \"question\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nchatml.prompt_pairs\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nchatml.ultra\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": 
\"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nllama3.argilla\n{\n \"system\": \"...\", // optional\n \"instruction\": \"...\",\n \"chosen_response\": \"...\",\n \"rejected_response\": \"...\"\n}\n\n\nllama3.argilla_chat\n{\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nllama3.icr\n{\n \"system\": \"...\", // optional\n \"input\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nllama3.intel\n{\n \"system\": \"...\", // optional\n \"question\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nllama3.prompt_pairs\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nllama3.ultra\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nzephyr.nectar\n{\n \"prompt\": \"...\",\n \"answers\": [\n {\n \"answer\": \"...\",\n \"rank\": 1\n },\n {\n \"answer\": \"...\",\n \"rank\": 2\n }\n // ... more answers with ranks\n ]\n}\n\n\nchat_template.default\nrl: dpo\ndatasets:\n - path: ...\n split: train\n type: chat_template.default\n field_messages: \"messages\"\n field_chosen: \"chosen\"\n field_rejected: \"rejected\"\n message_property_mappings:\n role: role\n content: content\n roles:\n user: [\"user\"]\n assistant: [\"assistant\"]\n system: [\"system\"]\nSample input format:\n{\n \"messages\": [\n {\n \"role\": \"system\",\n \"content\": \"...\"\n },\n {\n \"role\": \"user\",\n \"content\": \"...\"\n },\n // ... 
more messages\n ],\n \"chosen\": {\n \"role\": \"assistant\",\n \"content\": \"...\"\n },\n \"rejected\": {\n \"role\": \"assistant\",\n \"content\": \"...\"\n }\n}\n\n\nuser_defined.default\nFor custom behaviors,\nrl: dpo\ndatasets:\n - path: ...\n split: train\n type: user_defined.default\n\n field_prompt: \"prompt\"\n field_system: \"system\"\n field_chosen: \"chosen\"\n field_rejected: \"rejected\"\n prompt_format: \"{prompt}\"\n chosen_format: \"{chosen}\"\n rejected_format: \"{rejected}\"\nThe input format is a simple JSON input with customizable fields based on the above config.\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\n\nIPO\nAs IPO is just DPO with a different loss function, all supported dataset formats for DPO are also supported for IPO.\nrl: ipo\n\n\nORPO\nPaper: https://arxiv.org/abs/2403.07691\nrl: orpo\norpo_alpha: 0.1\nremove_unused_columns: false\n\nchat_template: chatml\ndatasets:\n - path: argilla/ultrafeedback-binarized-preferences-cleaned\n type: chat_template.argilla\nORPO supports the following types with the following dataset format:\n\nchat_template.argilla\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\", // if available, will be taken as user message for single-turn instead of from list below\n\n // chosen/rejected should be same till last content and only even-number of alternating user/assistant turns\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\n\nKTO\nrl: kto\nrl_beta: 0.1 # default\nkto_desirable_weight: 1.0 # default\nkto_undesirable_weight: 1.0 # default\n\nremove_unused_columns: false\n\ndatasets:\n - path: argilla/ultrafeedback-binarized-preferences-cleaned-kto\n type: llama3.ultra\n split: train\n\ngradient_checkpointing: 
true\ngradient_checkpointing_kwargs:\n use_reentrant: true\nKTO supports the following types with the following dataset format:\n\nchatml.argilla\n{\n \"system\": \"...\", // optional\n \"instruction\": \"...\",\n \"completion\": \"...\"\n}\n\n\nchatml.argilla_chat\n{\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"}\n ],\n \"completion\": [\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nchatml.intel\n{\n \"system\": \"...\", // optional\n \"question\": \"...\",\n \"completion\": \"...\"\n}\n\n\nchatml.prompt_pairs\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\"\n}\n\n\nchatml.ultra\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\"\n}\n\n\nllama3.argilla\n{\n \"system\": \"...\", // optional\n \"instruction\": \"...\",\n \"completion\": \"...\"\n}\n\n\nllama3.argilla_chat\n{\n \"completion\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nllama3.intel\n{\n \"system\": \"...\", // optional\n \"question\": \"...\",\n \"completion\": \"...\"\n}\n\n\nllama3.prompt_pairs\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\"\n}\n\n\nllama3.ultra\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\"\n}\n\n\nuser_defined.default\nFor custom behaviors,\nrl: kto\ndatasets:\n - path: ...\n split: train\n type: user_defined.default\n\n field_prompt: \"prompt\"\n field_system: \"system\"\n field_completion: \"completion\"\n field_label: \"label\"\n prompt_format: \"{prompt}\"\n completion_format: \"{completion}\"\nThe input format is a simple JSON input with customizable fields based on the above config.\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\",\n \"label\": \"...\"\n}\n\n\n\nGRPO\n\n\n\n\n\n\nTip\n\n\n\nCheck out our GRPO cookbook.\n\n\nIn the latest GRPO implementation, vLLM is used to significantly 
speedup trajectory generation during training. In this example, we’re using 4 GPUs - 2 for training, and 2 for vLLM:\n\n\n\n\n\n\nImportant\n\n\n\nMake sure you’ve installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. pip install axolotl[vllm].\n\n\nbase_model: Qwen/Qwen2.5-1.5B-Instruct\n\nvllm:\n host: 0.0.0.0\n port: 8000\n tensor_parallel_size: 2\n gpu_memory_utilization: 0.85\n dtype: auto\n # max_model_len: # you may find it useful to set the vLLM model context length if you know this beforehand\n\nrl: grpo\ntrl:\n use_vllm: true\n vllm_server_host: 0.0.0.0\n vllm_server_port: 8000\n vllm_server_timeout: 300\nCUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml\nYour vLLM instance will now attempt to spin up, and it’s time to kick off training utilizing our remaining two GPUs. In another terminal, execute:\nCUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2\n\n\n\n\n\n\nNote\n\n\n\nDue to TRL’s implementation with vLLM, the vLLM instance must use the last N GPUs instead of the first N GPUs. This is why in the example above, we use CUDA_VISIBLE_DEVICES=2,3 for the vLLM instance.\n\n\n\nReward functions\nGRPO uses custom reward functions and transformations. 
Please have them ready locally.\nFor example, to load OpenAI’s GSM8K and use a random reward for completions:\n# rewards.py\nimport random\n\ndef rand_reward_func(completions, **kwargs) -> list[float]:\n return [random.uniform(0, 1) for _ in completions]\n\ndef oai_gsm8k_transform(cfg, *args, **kwargs):\n def transform_fn(example, tokenizer=None):\n label = example[\"answer\"].split(\"####\")[-1].strip().replace(\",\", \"\")\n return {\n \"prompt\": [{\"role\": \"user\", \"content\": example[\"question\"]},],\n \"answer\": label,\n }\n return transform_fn, {\"remove_columns\": [\"question\"]}\nrl: grpo\n\ntrl:\n beta: 0.001\n max_completion_length: 256\n use_vllm: True\n num_generations: 4\n reward_funcs: [\"rewards.rand_reward_func\"] # format: '{file_name}.{fn_name}'\n reward_weights: [1.0]\ndatasets:\n - path: openai/gsm8k\n name: main\n type: rewards.oai_gsm8k_transform # format: '{file_name}.{fn_name}'\nTo see other examples of custom reward functions, please see TRL GRPO Docs.\nTo see all configs, please see TRLConfig.\n\n\nGRPO with DAPO/Dr. GRPO loss\nThe DAPO paper and subsequently Dr. GRPO paper proposed an alternative loss function for GRPO to remediate the penalty in longer responses.\ntrl:\n loss_type: dr_grpo\n # Normalizes loss based on max completion length (default: 256)\n max_completion_length:\nFor more information, see GRPO docs.\n\n\n\nSimPO\nSimPO uses CPOTrainer but with alternative loss function.\nrl: simpo\nrl_beta: 0.1 # default in CPOTrainer\ncpo_alpha: 1.0 # default in CPOTrainer\nsimpo_gamma: 0.5 # default in CPOTrainer\nThis method uses the same dataset format as DPO.\n\n\nUsing local dataset files\ndatasets:\n - ds_type: json\n data_files:\n - orca_rlhf.jsonl\n split: train\n type: chatml.intel\n\n\nTRL auto-unwrapping for PEFT\nTRL supports auto-unwrapping PEFT models for RL training paradigms which rely on a reference model. 
This significantly reduces memory pressure as an additional refreference model does not need to be loaded, and reference model log-probabilities can be obtained by disabling PEFT adapters. This is enabled by default. To turn it off, pass the following config:\n# load ref model when adapter training.\nrl_adapter_ref_model: true", "crumbs": [ "How To Guides", "RLHF (Beta)" @@ -619,7 +619,7 @@ "href": "docs/dataset_loading.html#loading-datasets", "title": "Dataset Loading", "section": "Loading Datasets", - "text": "Loading Datasets\nWe use the datasets library to load datasets and a mix of load_dataset and load_from_disk to load them.\nYou may recognize the similar named configs between load_dataset and the datasets section of the config file.\ndatasets:\n - path:\n name:\n data_files:\n split:\n revision:\n trust_remote_code:\n\n\n\n\n\n\nTip\n\n\n\nDo not feel overwhelmed by the number of options here. A lot of them are optional. In fact, the most common config to use would be path and sometimes data_files.\n\n\nThis matches the API of datasets.load_dataset, so if you’re familiar with that, you will feel right at home.\nFor HuggingFace’s guide to load different dataset types, see here.\nFor full details on the config, see config.qmd.\n\n\n\n\n\n\nNote\n\n\n\nYou can set multiple datasets in the config file by more than one entry under datasets.\ndatasets:\n - path: /path/to/your/dataset\n - path: /path/to/your/other/dataset\n\n\n\nLocal dataset\n\nFiles\nUsually, to load a JSON file, you would do something like this:\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"json\", data_files=\"data.json\")\nWhich translates to the following config:\ndatasets:\n - path: json\n data_files: /path/to/your/file.jsonl\nHowever, to make things easier, we have added a few shortcuts for loading local dataset files.\nYou can just point the path to the file or directory along with the ds_type to load the dataset. 
The below example shows for a JSON file:\ndatasets:\n - path: /path/to/your/file.jsonl\n ds_type: json\nThis works for CSV, JSON, Parquet, and Arrow files.\n\n\n\n\n\n\nTip\n\n\n\nIf path points to a file and ds_type is not specified, we will automatically infer the dataset type from the file extension, so you could omit ds_type if you’d like.\n\n\n\n\nDirectory\nIf you’re loading a directory, you can point the path to the directory.\nThen, you have two options:\n\nLoading entire directory\nYou do not need any additional configs.\nWe will attempt to load in the following order:\n- datasets saved with datasets.save_to_disk\n- loading entire directory of files (such as with parquet/arrow files)\ndatasets:\n - path: /path/to/your/directory\n\n\nLoading specific files in directory\nProvide data_files with a list of files to load.\ndatasets:\n # single file\n - path: /path/to/your/directory\n ds_type: csv\n data_files: file1.csv\n\n # multiple files\n - path: /path/to/your/directory\n ds_type: json\n data_files:\n - file1.jsonl\n - file2.jsonl\n\n # multiple files for parquet\n - path: /path/to/your/directory\n ds_type: parquet\n data_files:\n - file1.parquet\n - file2.parquet\n\n\n\n\nHuggingFace Hub\nThe method you use to load the dataset depends on how the dataset was created, whether a folder was uploaded directly or a HuggingFace Dataset was pushed.\n\n\n\n\n\n\nNote\n\n\n\nIf you’re using a private dataset, you will need to enable the hf_use_auth_token flag in the root-level of the config file.\n\n\n\nFolder uploaded\nThis would mean that the dataset is a single file or file(s) uploaded to the Hub.\ndatasets:\n - path: org/dataset-name\n data_files:\n - file1.jsonl\n - file2.jsonl\n\n\nHuggingFace Dataset\nThis means that the dataset is created as a HuggingFace Dataset and pushed to the Hub via datasets.push_to_hub.\ndatasets:\n - path: org/dataset-name\n\n\n\n\n\n\nNote\n\n\n\nThere are some other configs which may be required like name, split, revision, 
trust_remote_code, etc depending on the dataset.\n\n\n\n\n\nRemote Filesystems\nVia the storage_options config under load_dataset, you can load datasets from remote filesystems like S3, GCS, Azure, and OCI.\n\n\n\n\n\n\nWarning\n\n\n\nThis is currently experimental. Please let us know if you run into any issues!\n\n\nThe only difference between the providers is that you need to prepend the path with the respective protocols.\ndatasets:\n # Single file\n - path: s3://bucket-name/path/to/your/file.jsonl\n\n # Directory\n - path: s3://bucket-name/path/to/your/directory\nFor directory, we load via load_from_disk.\n\nS3\nPrepend the path with s3://.\nThe credentials are pulled in the following order:\n\nAWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_SESSION_TOKEN environment variables\nfrom the ~/.aws/credentials file\nfor nodes on EC2, the IAM metadata provider\n\n\n\n\n\n\n\nNote\n\n\n\nWe assume you have credentials setup and not using anonymous access. If you want to use anonymous access, let us know! 
We may have to open a config option for this.\n\n\nOther environment variables that can be set can be found in boto3 docs\n\n\nGCS\nPrepend the path with gs:// or gcs://.\nThe credentials are loaded in the following order:\n\ngcloud credentials\nfor nodes on GCP, the google metadata service\nanonymous access\n\n\n\nAzure\n\nGen 1\nPrepend the path with adl://.\nEnsure you have the following environment variables set:\n\nAZURE_STORAGE_TENANT_ID\nAZURE_STORAGE_CLIENT_ID\nAZURE_STORAGE_CLIENT_SECRET\n\n\n\nGen 2\nPrepend the path with abfs:// or az://.\nEnsure you have the following environment variables set:\n\nAZURE_STORAGE_ACCOUNT_NAME\nAZURE_STORAGE_ACCOUNT_KEY\n\nOther environment variables that can be set can be found in adlfs docs\n\n\n\nOCI\nPrepend the path with oci://.\nIt would attempt to read in the following order:\n\nOCIFS_IAM_TYPE, OCIFS_CONFIG_LOCATION, and OCIFS_CONFIG_PROFILE environment variables\nwhen on OCI resource, resource principal\n\nOther environment variables:\n\nOCI_REGION_METADATA\n\nPlease see the ocifs docs.\n\n\n\nHTTPS\nThe path should start with https://.\ndatasets:\n - path: https://path/to/your/dataset/file.jsonl\nThis must be publically accessible.", + "text": "Loading Datasets\nWe use the datasets library to load datasets and a mix of load_dataset and load_from_disk to load them.\nYou may recognize the similar named configs between load_dataset and the datasets section of the config file.\ndatasets:\n - path:\n name:\n data_files:\n split:\n revision:\n trust_remote_code:\n\n\n\n\n\n\nTip\n\n\n\nDo not feel overwhelmed by the number of options here. A lot of them are optional. 
In fact, the most common config to use would be path and sometimes data_files.\n\n\nThis matches the API of datasets.load_dataset, so if you’re familiar with that, you will feel right at home.\nFor HuggingFace’s guide to load different dataset types, see here.\nFor full details on the config, see config.qmd.\n\n\n\n\n\n\nNote\n\n\n\nYou can set multiple datasets in the config file by adding more than one entry under datasets.\ndatasets:\n - path: /path/to/your/dataset\n - path: /path/to/your/other/dataset\n\n\n\nLocal dataset\n\nFiles\nTo load a JSON file, you would do something like this:\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"json\", data_files=\"data.json\")\nWhich translates to the following config:\ndatasets:\n - path: data.json\n ds_type: json\nIn the example above, it can be seen that we can just point the path to the file or directory along with the ds_type to load the dataset.\nThis works for CSV, JSON, Parquet, and Arrow files.\n\n\n\n\n\n\nTip\n\n\n\nIf path points to a file and ds_type is not specified, we will automatically infer the dataset type from the file extension, so you could omit ds_type if you’d like.\n\n\n\n\nDirectory\nIf you’re loading a directory, you can point the path to the directory.\nThen, you have two options:\n\nLoading entire directory\nYou do not need any additional configs.\nWe will attempt to load in the following order:\n- datasets saved with datasets.save_to_disk\n- loading entire directory of files (such as with parquet/arrow files)\ndatasets:\n - path: /path/to/your/directory\n\n\nLoading specific files in directory\nProvide data_files with a list of files to load.\ndatasets:\n # single file\n - path: /path/to/your/directory\n ds_type: csv\n data_files: file1.csv\n\n # multiple files\n - path: /path/to/your/directory\n ds_type: json\n data_files:\n - file1.jsonl\n - file2.jsonl\n\n # multiple files for parquet\n - path: /path/to/your/directory\n ds_type: parquet\n data_files:\n - file1.parquet\n -
file2.parquet\n\n\n\n\nHuggingFace Hub\nThe method you use to load the dataset depends on how the dataset was created, whether a folder was uploaded directly or a HuggingFace Dataset was pushed.\n\n\n\n\n\n\nNote\n\n\n\nIf you’re using a private dataset, you will need to enable the hf_use_auth_token flag in the root-level of the config file.\n\n\n\nFolder uploaded\nThis would mean that the dataset is a single file or file(s) uploaded to the Hub.\ndatasets:\n - path: org/dataset-name\n data_files:\n - file1.jsonl\n - file2.jsonl\n\n\nHuggingFace Dataset\nThis means that the dataset is created as a HuggingFace Dataset and pushed to the Hub via datasets.push_to_hub.\ndatasets:\n - path: org/dataset-name\n\n\n\n\n\n\nNote\n\n\n\nThere are some other configs which may be required like name, split, revision, trust_remote_code, etc depending on the dataset.\n\n\n\n\n\nRemote Filesystems\nVia the storage_options config under load_dataset, you can load datasets from remote filesystems like S3, GCS, Azure, and OCI.\n\n\n\n\n\n\nWarning\n\n\n\nThis is currently experimental. Please let us know if you run into any issues!\n\n\nThe only difference between the providers is that you need to prepend the path with the respective protocols.\ndatasets:\n # Single file\n - path: s3://bucket-name/path/to/your/file.jsonl\n\n # Directory\n - path: s3://bucket-name/path/to/your/directory\nFor directory, we load via load_from_disk.\n\nS3\nPrepend the path with s3://.\nThe credentials are pulled in the following order:\n\nAWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_SESSION_TOKEN environment variables\nfrom the ~/.aws/credentials file\nfor nodes on EC2, the IAM metadata provider\n\n\n\n\n\n\n\nNote\n\n\n\nWe assume you have credentials set up and are not using anonymous access. If you want to use anonymous access, let us know!
We may have to open a config option for this.\n\n\nOther environment variables that can be set can be found in boto3 docs\n\n\nGCS\nPrepend the path with gs:// or gcs://.\nThe credentials are loaded in the following order:\n\ngcloud credentials\nfor nodes on GCP, the google metadata service\nanonymous access\n\n\n\nAzure\n\nGen 1\nPrepend the path with adl://.\nEnsure you have the following environment variables set:\n\nAZURE_STORAGE_TENANT_ID\nAZURE_STORAGE_CLIENT_ID\nAZURE_STORAGE_CLIENT_SECRET\n\n\n\nGen 2\nPrepend the path with abfs:// or az://.\nEnsure you have the following environment variables set:\n\nAZURE_STORAGE_ACCOUNT_NAME\nAZURE_STORAGE_ACCOUNT_KEY\n\nOther environment variables that can be set can be found in adlfs docs\n\n\n\nOCI\nPrepend the path with oci://.\nIt would attempt to read in the following order:\n\nOCIFS_IAM_TYPE, OCIFS_CONFIG_LOCATION, and OCIFS_CONFIG_PROFILE environment variables\nwhen on OCI resource, resource principal\n\nOther environment variables:\n\nOCI_REGION_METADATA\n\nPlease see the ocifs docs.\n\n\n\nHTTPS\nThe path should start with https://.\ndatasets:\n - path: https://path/to/your/dataset/file.jsonl\nThis must be publicly accessible.", "crumbs": [ "How To Guides", "Dataset Loading" @@ -3270,7 +3270,7 @@ "href": "docs/lora_optims.html#usage", "title": "LoRA Optimizations", "section": "Usage", - "text": "Usage\nThese optimizations can be enabled in your Axolotl config YAML file. The\nlora_mlp_kernel option enables the optimized MLP path, while lora_qkv_kernel and\nlora_o_kernel enable the fused query-key-value projection and optimized output\nprojection, respectively.\nlora_mlp_kernel: true\nlora_qkv_kernel: true\nlora_o_kernel: true", + "text": "Usage\nThese optimizations can be enabled in your Axolotl config YAML file.
The\nlora_mlp_kernel option enables the optimized MLP path, while lora_qkv_kernel and\nlora_o_kernel enable the fused query-key-value projection and optimized output\nprojection, respectively.\nlora_mlp_kernel: true\nlora_qkv_kernel: true\nlora_o_kernel: true\n\n\n\n\n\n\nNote\n\n\n\nCurrently, LoRA kernels are not supported for RLHF training, only SFT.", "crumbs": [ "How To Guides", "LoRA Optimizations" @@ -3380,7 +3380,7 @@ "href": "docs/config.html", "title": "Config Reference", "section": "", - "text": "# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files\n# This can also be a relative path to a model on disk\nbase_model: ./llama-7b-hf\n# You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)\nbase_model_ignore_patterns:\n# If the base_model repo on hf hub doesn't include configuration .json files,\n# You can set that here, or leave this empty to default to base_model\nbase_model_config: ./llama-7b-hf\n# You can specify to choose a specific model revision from huggingface hub\nrevision_of_model:\n# Optional tokenizer configuration path in case you want to use a different tokenizer\n# than the one defined in the base model\ntokenizer_config:\n# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too\nmodel_type: AutoModelForCausalLM\n# Corresponding tokenizer for the model AutoTokenizer is a good choice\ntokenizer_type: AutoTokenizer\n# Trust remote code for untrusted source\ntrust_remote_code:\n# use_fast option for tokenizer loading from_pretrained, default to True\ntokenizer_use_fast:\n# Whether to use the legacy tokenizer setting, defaults to True\ntokenizer_legacy:\n# Resize the model embeddings when new tokens are added to multiples of 32\n# This is reported to improve training speed on some models\nresize_token_embeddings_to_32x:\n# Optional[bool] Whether to shrink the embeddings to len(tokenizer). 
By default, we won't shrink.\nshrink_embeddings:\n# Optional[bool] Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs\nembeddings_skip_upcast:\n# Whether to load the model with randomly initialized weights. Useful for\n# pre-training a model from scratch or debugging purposes.\nrandom_init_weights:\n\n# (Internal use only)\n# Used to identify which the model is based on\nis_falcon_derived_model:\nis_llama_derived_model:\nis_qwen_derived_model:\n# Please note that if you set this to true, `padding_side` will be set to \"left\" by default\nis_mistral_derived_model:\n\n# optional overrides to the base model configuration\noverrides_of_model_config:\n # RoPE Scaling https://github.com/huggingface/transformers/pull/24653\n rope_scaling:\n type: # linear | dynamic\n factor: # float\n\n# optional overrides the base model loading from_pretrained\noverrides_of_model_kwargs:\n # use_cache: False\n\n# optional overrides to the bnb 4bit quantization configuration\n# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig\nbnb_config_kwargs:\n # These are default values\n llm_int8_has_fp16_weight: false\n bnb_4bit_quant_type: nf4\n bnb_4bit_use_double_quant: true\n\n\n# Whether you are training a 4-bit GPTQ quantized model\ngptq: true\n\n# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer\nload_in_8bit: true\n# Use bitsandbytes 4 bit\nload_in_4bit:\n\n# Use CUDA bf16\nbf16: true # bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection. 
require >=ampere\n# Use CUDA fp16\nfp16: true\n# Use CUDA tf32\ntf32: true # require >=ampere\n# Note: if bf16 is set to 'auto', and fp16 is set to true, we will prefer the explict fp16 setting\n\n# No AMP (automatic mixed precision)\nbfloat16: true # require >=ampere\nfloat16: true\n\n# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset\ngpu_memory_limit: 20GiB\n# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge\nlora_on_cpu: true\n\n# List[str]. Add plugins to extend the pipeline.\n# See `src/axolotl/integrations` for the available plugins or doc below for more details.\n# https://docs.axolotl.ai/docs/custom_integrations.html\nplugins:\n # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n\n# A list of one or more datasets to finetune the model with\ndatasets:\n # HuggingFace dataset repo | s3://,gs:// path | \"json\" for local dataset, make sure to fill data_files\n - path: vicgalle/alpaca-gpt4\n # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>\n ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file\n data_files: # Optional[str] path to source data files\n\n shards: # Optional[int] split dataset into N pieces (use with shards_idx)\n shards_idx: # Optional[int] = 0 the index of sharded dataset to use\n\n preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (exclusive with `shards`)\n\n name: # Optional[str] name of dataset configuration to load\n split: train # Optional[str] name of dataset split to load from\n revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. 
This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.\n trust_remote_code: # Optional[bool] Trust remote code for untrusted source\n\n # Custom user instruction prompt\n - path: repo\n type:\n # The below are defaults. only set what's needed if you use a different column name.\n system_prompt: \"\"\n system_format: \"{system}\"\n field_system: system\n field_instruction: instruction\n field_input: input\n field_output: output\n\n # Customizable to be single line or multi-line\n # Use {instruction}/{input} as key to be replaced\n # 'format' can include {input}\n format: |-\n User: {instruction} {input}\n Assistant:\n # 'no_input_format' cannot include {input}\n no_input_format: \"{instruction} \"\n\n # For `completion` datsets only, uses the provided field instead of `text` column\n field:\n\n # Using chat template\n - path: ...\n # Set type to `chat_template` to use this strategy\n type: chat_template\n # Specify the name of the chat template to use\n # The name of the chat template to use for training, following values are supported:\n # - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default.\n # - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py\n # - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. tokenizer_default_fallback_chatml.\n # - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.\n chat_template: tokenizer_default\n\n # Custom jinja chat template. 
Used only if `chat_template: jinja` or empty.\n chat_template_jinja:\n\n # Key containing the messages (default: \"messages\")\n field_messages: messages\n\n # Key containing the system message (default: \"system\")\n # If the system message is not present in the dataset sample, it will be loaded from the field_system property.\n field_system: system\n\n # Mapping of properties from the input dataset to the chat template.\n # (default: message_property_mappings={'role':'role', 'content':'content'})\n # If a property exists in the template but not in this mapping, the system will attempt\n # to load it directly from the message using the property name as the key.\n # Example: In the mapping below, 'from' is loaded from input dataset and used as 'role',\n # while 'value' is loaded and used as 'content' in the chat template.\n message_property_mappings:\n role: from\n content: value\n # ...\n\n # Optional[Dict[str, List]]. Roles mapping in the messages.\n # The format is {target_role: [source_roles]}. All source roles will be mapped to the target role.\n # The default is:\n roles:\n user: [\"human\", \"user\"]\n assistant: [\"gpt\", \"assistant\"]\n system: [\"system\"]\n tool: [\"tool\"]\n\n # Optional[bool]. Whether to drop the system turn from the dataset. Only works with chat_template.\n # This does not drop the default system message from chat_template if it exists. If you wish to,\n # we recommend using a custom jinja template with the default system message removed or\n # adding a system turn with empty content.\n drop_system_message:\n\n # Optional[bool]. 
(for Qwen3 template only) Whether to split the assistant content based on a reasoning trace inside delimited tags\n # See example at `docs/dataset-formats/conversation.qmd`\n split_thinking:\n\n # IMPORTANT: The following fields determine which parts of the conversation to train on.\n # Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train\n # See examples at `docs/dataset-formats/conversation.qmd`\n # Note: If the below 5 fields are empty, defaults to training only on the last message.\n\n # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.\n roles_to_train: [\"assistant\"] # default\n # Optional[str]. Which EOS tokens to train on in the conversation. Possible values are:\n # - all: train on all EOS tokens\n # - turn (default): train on the EOS token at the end of each trainable turn\n # - last: train on the last EOS token in the conversation\n # TIP: Please make sure that your `tokenizer.eos_token` is same as EOS/EOT token in template. Otherwise, set `eos_token` under `special_tokens`.\n train_on_eos: turn\n # Optional[str]. Which EOT (End-of-Turn) tokens to train on in the conversation. Possible values are:\n # - all: train on all EOT tokens\n # - turn: train on the EOT token at the end of each trainable turn\n # - last: train on the last EOT token in the conversation\n # If not specified, defaults to the value of train_on_eos for backward compatibility.\n train_on_eot:\n # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.\n message_field_training: training\n # The key in the message turn that contains the training details. 
Useful to selectively train on certain tokens in a turn.\n # The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train).\n message_field_training_detail: train_detail\n\n\n# If false, the datasets will not be shuffled and will keep their original order in `datasets`.\n# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.\nshuffle_merged_datasets: true\n\nDeduplicates datasets and test_datasets with identical entries.\ndataset_exact_deduplication: true\n\n# A list of one or more datasets to eval the model with.\n# You can use either test_datasets, or val_set_size, but not both.\ntest_datasets:\n - path: /workspace/data/eval.jsonl\n ds_type: json\n # You need to specify a split. For \"json\" datasets the default split is called \"train\".\n split: train\n type: completion\n data_files:\n - /workspace/data/eval.jsonl\n\n# use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'\nrl:\nrl_beta: # Optional[float]. The beta parameter for the RL training.\n\n# dpo\ndpo_use_weighting: # Optional[bool]. Whether to perform weighting.\nrpo_alpha: # Optional[float]. Weighting of NLL term in loss from RPO paper.\n\n# orpo\norpo_alpha: 0.1 # Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to `beta` in `ORPOConfig` due to trl mapping.\n\n# kto\nkto_desirable_weight: # Optional[float]. Factor for desirable loss term in KTO loss.\nkto_undesirable_weight: # Optional[float]. Factor for undesirable loss term in KTO loss.\n\n# simpo\ncpo_alpha: 1.0 # Weight of the BC regularizer\nsimpo_gamma: 0.5 # Target reward margin for the SimPO loss\n\n# grpo\ntrl:\n use_vllm: # Optional[bool]. Whether to use VLLM for RL training.\n vllm_server_host: # Optional[str]. Host of the vLLM server to connect to.\n vllm_server_port: # Optional[int]. 
Port of the vLLM server to connect to.\n vllm_server_timeout: # Optional[int]. Total timeout (in seconds) to wait for the vLLM server to respond.\n vllm_guided_decoding_regex: # Optional[str]. Regex for vLLM guided decoding.\n\n beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`. Use\n max_completion_length: # Optional[int]. Maximum length of the completion for RL training.\n\n reward_funcs: # Optional[list[str]]. List of reward functions to load. Paths must be importable from current dir.\n reward_weights: # Optional[list[float]]. List of reward weights for the reward functions.\n\n num_generations: # Optional[int]. Number of generations to sample.\n log_completions: # Optional[bool]. Whether to log completions.\n\n sync_ref_model: # Optional[bool]. Whether to sync the reference model.\n ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.\n ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.\n\n\n# reward modelling: `True` or `False`\nreward_model:\n\n# process reward modelling: `True` or `False`\nprocess_reward_model:\n\n# The name of the chat template to use for training, following values are supported:\n# - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value.\n# - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py\n# - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer.\n# - jinja: Uses a custom jinja template for the chat template. 
The custom jinja template should be provided in the chat_template_jinja field.\n# The selected chat template will be saved to the tokenizer_config.json for easier inferencing\n# Note: It is recommended to set train_on_inputs to true when using a chat template that is different from the model's default chat template.\nchat_template: tokenizer_default\n# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.\nchat_template_jinja: null\n# Optional[List[str]]. Custom EOT (End-of-Turn) tokens to mask/unmask during training.\n# These tokens mark the boundaries between conversation turns.\n# For example: [\"/INST\", \"</s>\", \"[/SYSTEM_PROMPT]\"]\n# If not specified, defaults to just the model's eos_token.\n# This is useful for templates that use multiple delimiter tokens.\neot_tokens:\n # - \"</s>\"\n # - \"[/INST]\"\n # - \"[/SYSTEM_PROMPT]\"\n# Changes the default system message\ndefault_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.\n# Axolotl attempts to save the dataset as an arrow after packing the data together so\n# subsequent training attempts load faster, relative path\ndataset_prepared_path: data/last_run_prepared\n# Push prepared dataset to hub\npush_dataset_to_hub: # Optional[str] repo_org/repo_name\n# The maximum number of processes to use while preprocessing your input dataset. 
This defaults to `os.cpu_count()`\n# if not set.\ndataset_processes: # defaults to os.cpu_count() if not set\n# Keep dataset in memory while preprocessing\n# Only needed if cached dataset is taking too much storage\ndataset_keep_in_memory:\n# push checkpoints to hub\nhub_model_id: # private repo path to push finetuned model\n# how to push checkpoints to hub\n# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy\nhub_strategy:\n# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets\n# Required to be true when used in combination with `push_dataset_to_hub`\nhf_use_auth_token: # boolean\n# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.\nval_set_size: 0.04\n# Num shards for whole dataset\ndataset_shard_num:\n# Index of shard to use for whole dataset\ndataset_shard_idx:\n\n# The maximum length of an input to train with, this should typically be less than 2048\n# as most models have a token/context limit of 2048\nsequence_len: 2048\n# Pad inputs so each step uses constant sized buffers\n# This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently\npad_to_sequence_len:\n# Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'\nsample_packing:\n# Set to 'false' if getting errors during eval with sample_packing on.\neval_sample_packing:\n# You can set these packing optimizations AFTER starting a training at least once.\n# The trainer will provide recommended values for these values.\nsample_packing_eff_est:\ntotal_num_tokens:\n# Increasing the following values helps with packing, but usually only slightly (<%1.)\n# The number of samples packed at a time.\nsample_packing_group_size: 100000\n# The number of samples which can be packed into one sequence. 
Increase if using a large sequence_len with many short samples.\nsample_packing_bin_size: 200\nsample_pack_sequentially: # Optional[bool]. Whether to pack samples sequentially.\n\n# whether to concatenate samples during pretraining\npretraining_sample_concatenation:\n\ncurriculum_sampling: # Optional[bool]. Whether to use sequential sampling for curriculum learning\n\n# Use batch flattening for speedups when not using sample_packing\nbatch_flattening:\n\n# Passed through to transformers when loading the model when launched without accelerate\n# Use `sequential` when training w/ model parallelism to limit memory\ndevice_map:\n# Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model.\nmax_memory:\n\n# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model\nadapter: lora\n# If you already have a lora model trained that you want to load, put that here.\n# This means after training, if you want to test the model, you should set this to the value of `output_dir`.\n# Note that if you merge an adapter to the base model, a new subdirectory `merged` will be created under the `output_dir`.\nlora_model_dir:\n\n# LoRA hyperparameters\n# For more details about the following options, see:\n# https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2\nlora_r: 8\nlora_alpha: 16\nlora_dropout: 0.05\nlora_target_modules:\n - q_proj\n - v_proj\n# - k_proj\n# - o_proj\n# - gate_proj\n# - down_proj\n# - up_proj\nlora_target_linear: # If true, will target all linear modules\n\n# List[int] | int. # The layer indices to transform, otherwise, apply to all layers\n# https://huggingface.co/docs/peft/v0.15.0/en/package_reference/lora#peft.LoraConfig.layers_to_transform\npeft_layers_to_transform:\n\n# Optional[bool]. 
Whether to use DoRA.\n# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#weight-decomposed-low-rank-adaptation-dora\npeft_use_dora:\n\n# Optional[bool]. Whether to use RSLoRA.\n# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#rank-stabilized-lora\npeft_use_rslora:\n\n# Optional[list[tuple[int, int]]]. List of layer indices to replicate.\n# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#memory-efficient-layer-replication-with-lora\npeft_layer_replication:\n\n# bool | Literal[\"gaussian\", \"eva\", \"olora\", \"pissa\", \"pissa_niter_[number of iters]\", \"corda\", \"loftq\"]\n# How to initialize LoRA weights. Default to True which is MS original implementation.\n# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#initialization\npeft_init_lora_weights:\n\n# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.\n# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.\n# `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.\n# https://github.com/huggingface/peft/issues/334#issuecomment-1561727994\nlora_modules_to_save:\n# - embed_tokens\n# - lm_head\n\nlora_fan_in_fan_out: false\n\n# Apply custom LoRA autograd functions and activation function Triton kernels for\n# speed and memory savings\n# See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_mlp_kernel: true\nlora_qkv_kernel: true\nlora_o_kernel: true\n\n# LoRA+ hyperparameters\n# For more details about the following options, see:\n# https://arxiv.org/abs/2402.12354 and `src/axolotl/core/train_builder.py`\nloraplus_lr_ratio: # loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.\nloraplus_lr_embedding: # loraplus learning rate for lora embedding layers. 
Default value is 1e-6.\n\npeft:\n # Configuration options for loftq initialization for LoRA\n # https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization\n loftq_config:\n loftq_bits: # typically 4 bits\n\n# ReLoRA configuration\n# Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed\nrelora_steps: # Number of steps per ReLoRA restart\nrelora_warmup_steps: # Number of per-restart warmup steps\nrelora_anneal_steps: # Number of anneal steps for each relora cycle\nrelora_prune_ratio: # threshold for optimizer magnitude when pruning\nrelora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings\n\n# wandb configuration if you're using it\n# Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`.\nwandb_mode: # \"offline\" to save run metadata locally and not sync to the server, \"disabled\" to turn off wandb\nwandb_project: # Your wandb project name\nwandb_entity: # A wandb Team name if using a Team\nwandb_watch:\nwandb_name: # Set the name of your wandb run\nwandb_run_id: # Set the ID of your wandb run\nwandb_log_model: # \"checkpoint\" to log model to wandb Artifacts every `save_steps` or \"end\" to log only at the end of training\n\n# mlflow configuration if you're using it\nmlflow_tracking_uri: # URI to mlflow\nmlflow_experiment_name: # Your experiment name\nmlflow_run_name: # Your run name\nhf_mlflow_log_artifacts: # set to true to copy each saved checkpoint on each save to mlflow artifact registry\n\n# Comet configuration if you're using it\n# Make sure your `COMET_API_KEY` environment variable is set (recommended) or you login to Comet with `comet login`.\n# Check out our documentation for more details https://www.comet.com/docs/v2/api-and-sdk/python-sdk/reference/Experiment-Creation/#comet_ml.start\nuse_comet: # Enable or disable Comet integration.\ncomet_api_key: # API key for Comet. 
Recommended to set via `comet login`.\ncomet_workspace: # Workspace name in Comet. Defaults to the user's default workspace.\ncomet_project_name: # Project name in Comet. Defaults to Uncategorized.\ncomet_experiment_key: # Identifier for the experiment. Used to append data to an existing experiment or control the key of new experiments. Default to a random key.\ncomet_mode: # Create a new experiment (\"create\") or log to an existing one (\"get\"). Default (\"get_or_create\") auto-selects based on configuration.\ncomet_online: # Set to True to log data to Comet server, or False for offline storage. Default is True.\ncomet_experiment_config: # Dictionary for additional configuration settings, see the doc for more details.\n\n# Tensorboard\nuse_tensorboard: # Optional[bool]\n\n# Where to save the full-finetuned model to\noutput_dir: ./completed-model\n\n# Whether to use torch.compile and which backend to use\n# setting to `auto` will enable torch compile when torch>=2.5.1\ntorch_compile: # Optional[Union[Literal[\"auto\"], bool]]\ntorch_compile_backend: # Optional[str]\n\n# Training hyperparameters\n\n# If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.\ngradient_accumulation_steps: 1\n# The number of samples to include in each batch. This is the number of samples sent to each GPU.\n# Batch size per gpu = micro_batch_size * gradient_accumulation_steps\nmicro_batch_size: 2\neval_batch_size:\nnum_epochs: 4\nwarmup_steps: 100 # cannot use with warmup_ratio\nwarmup_ratio: 0.05 # cannot use with warmup_steps\nlearning_rate: 0.00003\nlr_quadratic_warmup:\nlogging_steps:\neval_steps: # Leave empty to eval at each epoch, integer for every N steps. 
float for fraction of total steps\nevals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps\neval_strategy: # Set to `\"no\"` to skip evaluation, `\"epoch\"` at end of each epoch, leave empty to infer from `eval_steps`.\nsave_strategy: # Set to `\"no\"` to skip checkpoint saves, `\"epoch\"` at end of each epoch, `\"best\"` when better result is achieved, leave empty to infer from `save_steps`.\nsave_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps\nsaves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps\nsave_total_limit: # Checkpoints saved at a time\nsave_only_model: # Save only the model weights, skipping the optimizer. Using this means you can't resume from checkpoints.\n# Maximum number of iterations to train for. It takes precedence over num_epochs, which means that\n# if both are set, num_epochs will not be guaranteed.\n# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps\nmax_steps:\n\n# bool of whether to include tokens trained per second in the training metrics. This iterates over the entire dataset once, so it takes some time.\ninclude_tokens_per_second: # Optional[bool]\n\n# whether to find batch size that fits in memory. Passed to underlying transformers Trainer\nauto_find_batch_size: # Optional[bool]\n\neval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0\neval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128\ndo_causal_lm_eval: # Whether to run causal language model evaluation for metrics in `eval_causal_lm_metrics`.\neval_causal_lm_metrics: # HF evaluate metrics used during evaluation. 
Default is [\"sacrebleu\", \"comet\", \"ter\", \"chrf\", \"perplexity\"]\n\nprofiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir.\n # see https://pytorch.org/blog/understanding-gpu-memory-1/ for more information\n # snapshots can be visualized @ https://pytorch.org/memory_viz\n\nloss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)\nloss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)\n\n# Save model as safetensors (require safetensors package)\nsave_safetensors:\n\n# Whether to mask out or include the human's prompt from the training labels\ntrain_on_inputs: false\n# Group similarly sized data to minimize padding.\n# May be slower to start, as it must download and sort the entire dataset.\n# Note that training loss may have an oscillating pattern with this enabled.\ngroup_by_length: false\n\n# Whether to use gradient checkpointing. Available options are: true, false, \"offload\", \"offload_disk\".\n# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\ngradient_checkpointing: false\n# additional kwargs to pass to the trainer for gradient checkpointing\n# gradient_checkpointing_kwargs:\n# use_reentrant: true\n\n# Stop training after this many evaluation losses have increased in a row\n# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback\nearly_stopping_patience: 3\n\n# Specify a scheduler and kwargs to use with the optimizer\nlr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | 'linear' | 'cosine_with_restarts' | 'polynomial' | 'constant' | 'constant_with_warmup' | 'inverse_sqrt' | 'reduce_lr_on_plateau' | 'cosine_with_min_lr' | 'warmup_stable_decay' | empty for cosine\nlr_scheduler_kwargs:\ncosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. 
cosine_min_lr_ratio=0.1 for 10% of peak lr\ncosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)\n\n# For one_cycle optim\nlr_div_factor: # Learning rate div factor\n\n# Specify optimizer\n# Valid values are driven by the Transformers OptimizerNames class, see:\n# https://github.com/huggingface/transformers/blob/cbf924b76c03828101a34069a96d209314114fd5/src/transformers/training_args.py#L144-L189\n#\n# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of\n# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used\n# in the examples/ for your model and fine-tuning use case.\n#\n# Valid values for 'optimizer' include:\n# - adamw_torch\n# - adamw_torch_fused\n# - adamw_torch_xla\n# - adamw_torch_npu_fused\n# - adamw_apex_fused\n# - adopt_adamw (an EXPERIMENTAL optimizer, only for torch version >= 2.5.1)\n# - adafactor\n# - adamw_anyprecision\n# - adamw_torch_4bit\n# - ademamix\n# - sgd\n# - adagrad\n# - adamw_bnb_8bit\n# - adamw_8bit # alias for adamw_bnb_8bit\n# - ademamix_8bit\n# - lion_8bit\n# - lion_32bit\n# - paged_adamw_32bit\n# - paged_adamw_8bit\n# - paged_ademamix_32bit\n# - paged_ademamix_8bit\n# - paged_lion_32bit\n# - paged_lion_8bit\n# - rmsprop\n# - rmsprop_bnb\n# - rmsprop_bnb_8bit\n# - rmsprop_bnb_32bit\n# - galore_adamw\n# - galore_adamw_8bit\n# - galore_adafactor\n# - galore_adamw_layerwise\n# - galore_adamw_8bit_layerwise\n# - galore_adafactor_layerwise\n# - lomo\n# - adalomo\n# - grokadamw\n# - schedule_free_adamw\n# - schedule_free_sgd\n# - apollo_adamw\n# - apollo_adamw_layerwise\n#\n# Additional custom optimizers include:\n# - optimi_adamw\n# - ao_adamw_8bit\n# - ao_adamw_fp8\n# - came_pytorch\noptimizer:\n# Dictionary of arguments to pass to the optimizer\noptim_args:\n# For Galore 
Optimizers the following optim_args are available\n# rank: # type: int\n# update_proj_gap # type: int\n# scale # type: float\n# proj_type: # type: str, default = std\n\n# The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm\noptim_target_modules:\n# - self_attn # for llama\n# - mlp\n\n# Specify weight decay\nweight_decay:\n# adamw hyperparams\nadam_beta1:\nadam_beta2:\nadam_beta3: # only used for CAME Optimizer\nadam_epsilon:\nadam_epsilon2: # only used for CAME Optimizer\n# Gradient clipping max norm\nmax_grad_norm:\n\n# Augmentation techniques\n# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings\n# currently only supported on Llama and Mistral\nneftune_noise_alpha:\n\n# Optional[bool]. Whether to bettertransformers\nflash_optimum:\n\n# Note: Only one of the following attention patches can be used at a time.\n# For example, if you set `xformers_attention` to `true`, do not set `flash_attention` to `true`.\n\n# Optional[bool]. Whether to use xformers attention patch https://github.com/facebookresearch/xformers:\nxformers_attention:\n# Optional[bool]. Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:\nflash_attention:\nflash_attn_cross_entropy: # Optional[bool]. Whether to use flash-attention cross entropy implementation - advanced use only\nflash_attn_rms_norm: # Optional[bool]. Whether to use flash-attention rms norm implementation - advanced use only\nflash_attn_fuse_qkv: # Optional[bool]. Whether to fuse QKV into a single operation\nflash_attn_fuse_mlp: # Optional[bool]. Whether to fuse part of the MLP into a single operation\n# Optional[bool]. Whether to use scaled-dot-product attention\n# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html\nsdp_attention:\n# Optional[bool]. 
Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf\ns2_attention:\n\n# Optional[bool]. Whether to use low_cpu_mem_usage\nlow_cpu_mem_usage:\n# Optional[str]. Resume from a specific checkpoint dir\nresume_from_checkpoint:\n# Optional[bool]. If resume_from_checkpoint isn't set and you simply want it to start where it left off.\n# Be careful with this being turned on between different models.\nauto_resume_from_checkpoints: false\n\n## Multimodal section\n# int | tuple[int, int] | None . Size to resize images to, width x height.\n# Will read from model/processor config if not set.\nimage_size:\n# str. Algorithm to use for image resizing. \"bilinear\", \"bicubic\", \"lanczos\". Default is \"bilinear\".\nimage_resize_algorithm: 'bilinear'\n## End of multimodal section\n\n# Don't mess with this, it's here for accelerate and torchrun\nlocal_rank:\n\n# Add or change special tokens.\n# If you add tokens here, you don't need to add them to the `tokens` list.\nspecial_tokens:\n # bos_token: \"<s>\"\n # eos_token: \"</s>\"\n # unk_token: \"<unk>\"\n # pad_token: \"[PAD]\"\n\n# Optional[list[str]]. Add extra tokens to the tokenizer.\ntokens:\n # - \"<|startoftext|>\"\n # - \"<|endoftext|>\"\n\n# Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer.\n# Only works for tokens that are not part of the base vocab (aka are added_tokens).\n# Can be checked if they exist in tokenizer.json added_tokens.\nadded_tokens_overrides: # Dict[int, str]\n# 128041: \"<|im_start|>\"\n# 128042: \"<|im_end|>\"\n\n# FSDP\nfsdp:\nfsdp_config:\n\n# Deepspeed config path. 
e.g., deepspeed_configs/zero3.json\ndeepspeed:\n\n# Advanced DDP Arguments\nddp_timeout:\nddp_bucket_cap_mb:\nddp_broadcast_buffers:\n\n# Sequence parallelism\n# Set to a divisor of the number of GPUs available to split sequences into chunks of equal size.\n# Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.\n# E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized\n# subsequences, or set to 4 to split into four equal-sized subsequences.\n# See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details.\nsequence_parallel_degree:\n# Optional; strides across the key dimension. Larger values use more memory but should make training faster.\n# Must evenly divide the number of KV heads in your model.\nheads_k_stride: 1\n# One of \"varlen_llama3\", \"batch_ring\", \"batch_zigzag\", \"batch_stripe\". Defaults to \"varlen_llama3\"\n# in the sample packing case, and \"batch_ring\" in the non-sample packing case.\nring_attn_func:\n\n# Path to torch distx for optim 'adamw_anyprecision'\ntorchdistx_path:\n\n# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize\npretraining_dataset:\n\n# Debug mode\ndebug:\n\n# Seed\nseed:\n\n# Allow overwrite yml config using from cli\nstrict:", + "text": "# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files\n# This can also be a relative path to a model on disk\nbase_model: ./llama-7b-hf\n# You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)\nbase_model_ignore_patterns:\n# If the base_model repo on hf hub doesn't include configuration .json files,\n# You can set that here, or leave this empty to default to base_model\nbase_model_config: ./llama-7b-hf\n# You can specify to choose a specific model revision from huggingface hub\nrevision_of_model:\n# Optional tokenizer configuration path in case you want to use a different tokenizer\n# than the one 
defined in the base model\ntokenizer_config:\n# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too\nmodel_type: AutoModelForCausalLM\n# Corresponding tokenizer for the model AutoTokenizer is a good choice\ntokenizer_type: AutoTokenizer\n# Trust remote code for untrusted source\ntrust_remote_code:\n# use_fast option for tokenizer loading from_pretrained, default to True\ntokenizer_use_fast:\n# Whether to use the legacy tokenizer setting, defaults to True\ntokenizer_legacy:\n# Resize the model embeddings when new tokens are added to multiples of 32\n# This is reported to improve training speed on some models\nresize_token_embeddings_to_32x:\n# Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.\nshrink_embeddings:\n# Optional[bool] Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs\nembeddings_skip_upcast:\n# Whether to load the model with randomly initialized weights. Useful for\n# pre-training a model from scratch or debugging purposes.\nrandom_init_weights:\n\n# (Internal use only)\n# Used to identify which the model is based on\nis_falcon_derived_model:\nis_llama_derived_model:\nis_qwen_derived_model:\n# Please note that if you set this to true, `padding_side` will be set to \"left\" by default\nis_mistral_derived_model:\n\n# optional overrides to the base model configuration\noverrides_of_model_config:\n # RoPE Scaling https://github.com/huggingface/transformers/pull/24653\n rope_scaling:\n type: # linear | dynamic\n factor: # float\n\n# optional overrides the base model loading from_pretrained\noverrides_of_model_kwargs:\n # use_cache: False\n\n# optional overrides to the bnb 4bit quantization configuration\n# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig\nbnb_config_kwargs:\n # These are default values\n llm_int8_has_fp16_weight: false\n bnb_4bit_quant_type: nf4\n bnb_4bit_use_double_quant: 
true\n\n\n# Whether you are training a 4-bit GPTQ quantized model\ngptq: true\n\n# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer\nload_in_8bit: true\n# Use bitsandbytes 4 bit\nload_in_4bit:\n\n# Use CUDA bf16\nbf16: true # bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection. require >=ampere\n# Use CUDA fp16\nfp16: true\n# Use CUDA tf32\ntf32: true # require >=ampere\n# Note: if bf16 is set to 'auto', and fp16 is set to true, we will prefer the explicit fp16 setting\n\n# No AMP (automatic mixed precision)\nbfloat16: true # require >=ampere\nfloat16: true\n\n# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset\ngpu_memory_limit: 20GiB\n# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge\nlora_on_cpu: true\n\n# List[str]. Add plugins to extend the pipeline.\n# See `src/axolotl/integrations` for the available plugins or doc below for more details.\n# https://docs.axolotl.ai/docs/custom_integrations.html\nplugins:\n # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n\n# A list of one or more datasets to finetune the model with\n# See https://docs.axolotl.ai/docs/dataset_loading.html for guide on loading datasets\n# See https://docs.axolotl.ai/docs/dataset-formats/ for guide on dataset formats\ndatasets:\n # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory\n - path: vicgalle/alpaca-gpt4\n # The type of prompt to use for training. 
[alpaca, gpteacher, oasst, reflection]\n type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>\n ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file\n data_files: # Optional[str] path to source data files\n\n shards: # Optional[int] split dataset into N pieces (use with shards_idx)\n shards_idx: # Optional[int] = 0 the index of sharded dataset to use\n\n preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (exclusive with `shards`)\n\n name: # Optional[str] name of dataset configuration to load\n split: train # Optional[str] name of dataset split to load from\n revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.\n trust_remote_code: # Optional[bool] Trust remote code for untrusted source\n\n # Custom user instruction prompt\n - path: repo\n type:\n # The below are defaults. 
only set what's needed if you use a different column name.\n system_prompt: \"\"\n system_format: \"{system}\"\n field_system: system\n field_instruction: instruction\n field_input: input\n field_output: output\n\n # Customizable to be single line or multi-line\n # Use {instruction}/{input} as key to be replaced\n # 'format' can include {input}\n format: |-\n User: {instruction} {input}\n Assistant:\n # 'no_input_format' cannot include {input}\n no_input_format: \"{instruction} \"\n\n # For `completion` datasets only, uses the provided field instead of `text` column\n field:\n\n # Using chat template\n - path: ...\n # Set type to `chat_template` to use this strategy\n type: chat_template\n # Specify the name of the chat template to use\n # The name of the chat template to use for training, following values are supported:\n # - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default.\n # - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py\n # - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. tokenizer_default_fallback_chatml.\n # - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.\n chat_template: tokenizer_default\n\n # Custom jinja chat template. 
Used only if `chat_template: jinja` or empty.\n chat_template_jinja:\n\n # Key containing the messages (default: \"messages\")\n field_messages: messages\n\n # Key containing the system message (default: \"system\")\n # If the system message is not present in the dataset sample, it will be loaded from the field_system property.\n field_system: system\n\n # Mapping of properties from the input dataset to the chat template.\n # (default: message_property_mappings={'role':'role', 'content':'content'})\n # If a property exists in the template but not in this mapping, the system will attempt\n # to load it directly from the message using the property name as the key.\n # Example: In the mapping below, 'from' is loaded from input dataset and used as 'role',\n # while 'value' is loaded and used as 'content' in the chat template.\n message_property_mappings:\n role: from\n content: value\n # ...\n\n # Optional[Dict[str, List]]. Roles mapping in the messages.\n # The format is {target_role: [source_roles]}. All source roles will be mapped to the target role.\n # The default is:\n roles:\n user: [\"human\", \"user\"]\n assistant: [\"gpt\", \"assistant\"]\n system: [\"system\"]\n tool: [\"tool\"]\n\n # Optional[bool]. Whether to drop the system turn from the dataset. Only works with chat_template.\n # This does not drop the default system message from chat_template if it exists. If you wish to,\n # we recommend using a custom jinja template with the default system message removed or\n # adding a system turn with empty content.\n drop_system_message:\n\n # Optional[bool]. 
(for Qwen3 template only) Whether to split the assistant content based on a reasoning trace inside delimited tags\n # See example at `docs/dataset-formats/conversation.qmd`\n split_thinking:\n\n # IMPORTANT: The following fields determine which parts of the conversation to train on.\n # Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train\n # See examples at `docs/dataset-formats/conversation.qmd`\n # Note: If the below 5 fields are empty, defaults to training only on the last message.\n\n # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.\n roles_to_train: [\"assistant\"] # default\n # Optional[str]. Which EOS tokens to train on in the conversation. Possible values are:\n # - all: train on all EOS tokens\n # - turn (default): train on the EOS token at the end of each trainable turn\n # - last: train on the last EOS token in the conversation\n # TIP: Please make sure that your `tokenizer.eos_token` is same as EOS/EOT token in template. Otherwise, set `eos_token` under `special_tokens`.\n train_on_eos: turn\n # Optional[str]. Which EOT (End-of-Turn) tokens to train on in the conversation. Possible values are:\n # - all: train on all EOT tokens\n # - turn: train on the EOT token at the end of each trainable turn\n # - last: train on the last EOT token in the conversation\n # If not specified, defaults to the value of train_on_eos for backward compatibility.\n train_on_eot:\n # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.\n message_field_training: training\n # The key in the message turn that contains the training details. 
Useful to selectively train on certain tokens in a turn.\n # The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train).\n message_field_training_detail: train_detail\n\n\n# If false, the datasets will not be shuffled and will keep their original order in `datasets`.\n# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.\nshuffle_merged_datasets: true\n\n# Deduplicates datasets and test_datasets with identical entries.\ndataset_exact_deduplication: true\n\n# A list of one or more datasets to eval the model with.\n# You can use either test_datasets, or val_set_size, but not both.\ntest_datasets:\n - path: /workspace/data/eval.jsonl\n ds_type: json\n # You need to specify a split. For \"json\" datasets the default split is called \"train\".\n split: train\n type: completion\n data_files:\n - /workspace/data/eval.jsonl\n\n# use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'\nrl:\nrl_beta: # Optional[float]. The beta parameter for the RL training.\n\n# dpo\ndpo_use_weighting: # Optional[bool]. Whether to perform weighting.\nrpo_alpha: # Optional[float]. Weighting of NLL term in loss from RPO paper.\n\n# orpo\norpo_alpha: 0.1 # Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to `beta` in `ORPOConfig` due to trl mapping.\n\n# kto\nkto_desirable_weight: # Optional[float]. Factor for desirable loss term in KTO loss.\nkto_undesirable_weight: # Optional[float]. Factor for undesirable loss term in KTO loss.\n\n# simpo\ncpo_alpha: 1.0 # Weight of the BC regularizer\nsimpo_gamma: 0.5 # Target reward margin for the SimPO loss\n\n# grpo\ntrl:\n use_vllm: # Optional[bool]. Whether to use VLLM for RL training.\n vllm_server_host: # Optional[str]. Host of the vLLM server to connect to.\n vllm_server_port: # Optional[int]. 
Port of the vLLM server to connect to.\n vllm_server_timeout: # Optional[int]. Total timeout (in seconds) to wait for the vLLM server to respond.\n vllm_guided_decoding_regex: # Optional[str]. Regex for vLLM guided decoding.\n\n beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`. Use\n max_completion_length: # Optional[int]. Maximum length of the completion for RL training.\n\n reward_funcs: # Optional[list[str]]. List of reward functions to load. Paths must be importable from current dir.\n reward_weights: # Optional[list[float]]. List of reward weights for the reward functions.\n\n num_generations: # Optional[int]. Number of generations to sample.\n log_completions: # Optional[bool]. Whether to log completions.\n num_completions_to_print: # Optional[int]. Number of completions to print when log_completions is True.\n\n sync_ref_model: # Optional[bool]. Whether to sync the reference model.\n ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.\n ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.\n scale_rewards: # Optional[bool]. Whether to scale rewards by their standard deviation.\n\n temperature: # Optional[float]. Sampling temperature for the GRPO policy.\n top_p: # Optional[float]. Top-p sampling probability for the generation policy.\n top_k: # Optional[int]. Top-k sampling for the generation policy.\n min_p: # Optional[float]. Minimum probability for the generation policy.\n repetition_penalty: # Optional[float]. Penalty for tokens that appear in prompt and generated text.\n\n num_iterations: # Optional[int]. Number of iterations per batch (μ) for GRPO.\n epsilon: # Optional[float]. Epsilon value for clipping in the GRPO algorithm.\n epsilon_high: # Optional[float]. Upper-bound epsilon value for clipping in the GRPO algorithm.\n use_liger_loss: # Optional[bool]. Whether to use Liger loss for GRPO.\n loss_type: # Optional[str]. Loss formulation to use. 
Supported values: grpo, bnpo, dr_grpo.\n mask_truncated_completions: # Optional[bool]. Whether to exclude truncated completions from loss calculation.\n\n\n# reward modelling: `True` or `False`\nreward_model:\n\n# process reward modelling: `True` or `False`\nprocess_reward_model:\n\n# The name of the chat template to use for training, following values are supported:\n# - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value.\n# - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py\n# - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer.\n# - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.\n# The selected chat template will be saved to the tokenizer_config.json for easier inferencing\n# Note: It is recommended to set train_on_inputs to true when using a chat template that is different from the model's default chat template.\nchat_template: tokenizer_default\n# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.\nchat_template_jinja: null\n# Optional[List[str]]. 
Custom EOT (End-of-Turn) tokens to mask/unmask during training.\n# These tokens mark the boundaries between conversation turns.\n# For example: [\"[/INST]\", \"</s>\", \"[/SYSTEM_PROMPT]\"]\n# If not specified, defaults to just the model's eos_token.\n# This is useful for templates that use multiple delimiter tokens.\neot_tokens:\n # - \"</s>\"\n # - \"[/INST]\"\n # - \"[/SYSTEM_PROMPT]\"\n# Changes the default system message\ndefault_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.\n# Axolotl attempts to save the dataset as an arrow after packing the data together so\n# subsequent training attempts load faster, relative path\ndataset_prepared_path: data/last_run_prepared\n# Push prepared dataset to hub\npush_dataset_to_hub: # Optional[str] repo_org/repo_name\n# The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`\n# if not set.\ndataset_processes: # defaults to os.cpu_count() if not set\n# Keep dataset in memory while preprocessing\n# Only needed if cached dataset is taking too much storage\ndataset_keep_in_memory:\n# push checkpoints to hub\nhub_model_id: # private repo path to push finetuned model\n# how to push checkpoints to hub\n# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy\nhub_strategy:\n# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets\n# Required to be true when used in combination with `push_dataset_to_hub`\nhf_use_auth_token: # boolean\n# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 
0 for no eval.\nval_set_size: 0.04\n# Num shards for whole dataset\ndataset_shard_num:\n# Index of shard to use for whole dataset\ndataset_shard_idx:\n\n# The maximum length of an input to train with, this should typically be less than 2048\n# as most models have a token/context limit of 2048\nsequence_len: 2048\n# Pad inputs so each step uses constant sized buffers\n# This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently\npad_to_sequence_len:\n# Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'\nsample_packing:\n# Set to 'false' if getting errors during eval with sample_packing on.\neval_sample_packing:\n# You can set these packing optimizations AFTER starting a training at least once.\n# The trainer will provide recommended values for these settings.\nsample_packing_eff_est:\ntotal_num_tokens:\n# Increasing the following values helps with packing, but usually only slightly (<1%).\n# The number of samples packed at a time.\nsample_packing_group_size: 100000\n# The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.\nsample_packing_bin_size: 200\nsample_pack_sequentially: # Optional[bool]. Whether to pack samples sequentially.\n\n# whether to concatenate samples during pretraining\npretraining_sample_concatenation:\n\ncurriculum_sampling: # Optional[bool]. Whether to use sequential sampling for curriculum learning\n\n# Use batch flattening for speedups when not using sample_packing\nbatch_flattening:\n\n# Passed through to transformers when loading the model when launched without accelerate\n# Use `sequential` when training w/ model parallelism to limit memory\ndevice_map:\n# Defines the max memory usage per gpu on the system. 
Passed through to transformers when loading the model.\nmax_memory:\n\n# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model\nadapter: lora\n# If you already have a lora model trained that you want to load, put that here.\n# This means after training, if you want to test the model, you should set this to the value of `output_dir`.\n# Note that if you merge an adapter to the base model, a new subdirectory `merged` will be created under the `output_dir`.\nlora_model_dir:\n\n# LoRA hyperparameters\n# For more details about the following options, see:\n# https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2\nlora_r: 8\nlora_alpha: 16\nlora_dropout: 0.05\nlora_target_modules:\n - q_proj\n - v_proj\n# - k_proj\n# - o_proj\n# - gate_proj\n# - down_proj\n# - up_proj\nlora_target_linear: # If true, will target all linear modules\n\n# List[int] | int. # The layer indices to transform, otherwise, apply to all layers\n# https://huggingface.co/docs/peft/v0.15.0/en/package_reference/lora#peft.LoraConfig.layers_to_transform\npeft_layers_to_transform:\n\n# Optional[bool]. Whether to use DoRA.\n# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#weight-decomposed-low-rank-adaptation-dora\npeft_use_dora:\n\n# Optional[bool]. Whether to use RSLoRA.\n# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#rank-stabilized-lora\npeft_use_rslora:\n\n# Optional[list[tuple[int, int]]]. List of layer indices to replicate.\n# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#memory-efficient-layer-replication-with-lora\npeft_layer_replication:\n\n# bool | Literal[\"gaussian\", \"eva\", \"olora\", \"pissa\", \"pissa_niter_[number of iters]\", \"corda\", \"loftq\"]\n# How to initialize LoRA weights. 
Default to True which is MS original implementation.\n# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#initialization\npeft_init_lora_weights:\n\n# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.\n# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.\n# `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.\n# https://github.com/huggingface/peft/issues/334#issuecomment-1561727994\nlora_modules_to_save:\n# - embed_tokens\n# - lm_head\n\nlora_fan_in_fan_out: false\n\n# Apply custom LoRA autograd functions and activation function Triton kernels for\n# speed and memory savings\n# See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_mlp_kernel: true\nlora_qkv_kernel: true\nlora_o_kernel: true\n\n# LoRA+ hyperparameters\n# For more details about the following options, see:\n# https://arxiv.org/abs/2402.12354 and `src/axolotl/core/trainer_builder.py`\nloraplus_lr_ratio: # loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.\nloraplus_lr_embedding: # loraplus learning rate for lora embedding layers. 
Default value is 1e-6.\n\npeft:\n # Configuration options for loftq initialization for LoRA\n # https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization\n loftq_config:\n loftq_bits: # typically 4 bits\n\n# ReLoRA configuration\n# Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed\nrelora_steps: # Number of steps per ReLoRA restart\nrelora_warmup_steps: # Number of per-restart warmup steps\nrelora_anneal_steps: # Number of anneal steps for each relora cycle\nrelora_prune_ratio: # threshold for optimizer magnitude when pruning\nrelora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings\n\n# wandb configuration if you're using it\n# Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`.\nwandb_mode: # \"offline\" to save run metadata locally and not sync to the server, \"disabled\" to turn off wandb\nwandb_project: # Your wandb project name\nwandb_entity: # A wandb Team name if using a Team\nwandb_watch:\nwandb_name: # Set the name of your wandb run\nwandb_run_id: # Set the ID of your wandb run\nwandb_log_model: # \"checkpoint\" to log model to wandb Artifacts every `save_steps` or \"end\" to log only at the end of training\n\n# mlflow configuration if you're using it\nmlflow_tracking_uri: # URI to mlflow\nmlflow_experiment_name: # Your experiment name\nmlflow_run_name: # Your run name\nhf_mlflow_log_artifacts: # set to true to copy each saved checkpoint on each save to mlflow artifact registry\n\n# Comet configuration if you're using it\n# Make sure your `COMET_API_KEY` environment variable is set (recommended) or you login to Comet with `comet login`.\n# Check out our documentation for more details https://www.comet.com/docs/v2/api-and-sdk/python-sdk/reference/Experiment-Creation/#comet_ml.start\nuse_comet: # Enable or disable Comet integration.\ncomet_api_key: # API key for Comet. 
Recommended to set via `comet login`.\ncomet_workspace: # Workspace name in Comet. Defaults to the user's default workspace.\ncomet_project_name: # Project name in Comet. Defaults to Uncategorized.\ncomet_experiment_key: # Identifier for the experiment. Used to append data to an existing experiment or control the key of new experiments. Default to a random key.\ncomet_mode: # Create a new experiment (\"create\") or log to an existing one (\"get\"). Default (\"get_or_create\") auto-selects based on configuration.\ncomet_online: # Set to True to log data to Comet server, or False for offline storage. Default is True.\ncomet_experiment_config: # Dictionary for additional configuration settings, see the doc for more details.\n\n# Tensorboard\nuse_tensorboard: # Optional[bool]\n\n# Where to save the full-finetuned model to\noutput_dir: ./completed-model\n\n# Whether to use torch.compile and which backend to use\n# setting to `auto` will enable torch compile when torch>=2.5.1\ntorch_compile: # Optional[Union[Literal[\"auto\"], bool]]\ntorch_compile_backend: # Optional[str]\n\n# Training hyperparameters\n\n# If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.\ngradient_accumulation_steps: 1\n# The number of samples to include in each batch. This is the number of samples sent to each GPU.\n# Batch size per gpu = micro_batch_size * gradient_accumulation_steps\nmicro_batch_size: 2\neval_batch_size:\nnum_epochs: 4\nwarmup_steps: 100 # cannot use with warmup_ratio\nwarmup_ratio: 0.05 # cannot use with warmup_steps\nlearning_rate: 0.00003\nlr_quadratic_warmup:\nlogging_steps:\neval_steps: # Leave empty to eval at each epoch, integer for every N steps. 
float for fraction of total steps\nevals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps\neval_strategy: # Set to `\"no\"` to skip evaluation, `\"epoch\"` at end of each epoch, leave empty to infer from `eval_steps`.\nsave_strategy: # Set to `\"no\"` to skip checkpoint saves, `\"epoch\"` at end of each epoch, `\"best\"` when better result is achieved, leave empty to infer from `save_steps`.\nsave_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps\nsaves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps\nsave_total_limit: # Checkpoints saved at a time\nsave_only_model: # Save only the model weights, skipping the optimizer. Using this means you can't resume from checkpoints.\n# Maximum number of iterations to train for. It precedes num_epochs which means that\n# if both are set, num_epochs will not be guaranteed.\n# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps\nmax_steps:\n\n# bool of whether to include tokens trainer per second in the training metrics. This iterates over the entire dataset once, so it takes some time.\ninclude_tokens_per_second: # Optional[bool]\n\n# whether to find batch size that fits in memory. Passed to underlying transformers Trainer\nauto_find_batch_size: # Optional[bool]\n\neval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0\neval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128\ndo_causal_lm_eval: # Whether to run causal language model evaluation for metrics in `eval_causal_lm_metrics`.\neval_causal_lm_metrics: # HF evaluate metrics used during evaluation. 
Default is [\"sacrebleu\", \"comet\", \"ter\", \"chrf\", \"perplexity\"]\n\nprofiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir.\n # see https://pytorch.org/blog/understanding-gpu-memory-1/ for more information\n # snapshots can be visualized @ https://pytorch.org/memory_viz\n\nloss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)\nloss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)\n\n# Save model as safetensors (require safetensors package)\nsave_safetensors:\n\n# Whether to mask out or include the human's prompt from the training labels\ntrain_on_inputs: false\n# Group similarly sized data to minimize padding.\n# May be slower to start, as it must download and sort the entire dataset.\n# Note that training loss may have an oscillating pattern with this enabled.\ngroup_by_length: false\n\n# Whether to use gradient checkpointing. 
Available options are: true, false, \"offload\", \"offload_disk\".\n# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\ngradient_checkpointing: false\n# additional kwargs to pass to the trainer for gradient checkpointing\n# gradient_checkpointing_kwargs:\n# use_reentrant: true\n\n# Stop training after this many evaluation losses have increased in a row\n# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback\nearly_stopping_patience: 3\n\n# Specify a scheduler and kwargs to use with the optimizer\n# Valid values are driven by the Transformers SchedulerType class, see:\n# https://github.com/huggingface/transformers/blob/5f4ecf2d9f867a1255131d2461d75793c0cf1db2/src/transformers/trainer_utils.py#L420\n# Valid values include\n# - 'linear'\n# - 'cosine' (default)\n# - 'cosine_with_restarts'\n# - 'polynomial'\n# - 'constant'\n# - 'constant_with_warmup'\n# - 'inverse_sqrt'\n# - 'reduce_lr_on_plateau'\n# - 'cosine_with_min_lr'\n# - 'warmup_stable_decay'\n\n# Additional schedulers include:\n# - 'one_cycle'\n# - 'rex'\nlr_scheduler:\nlr_scheduler_kwargs:\ncosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr\ncosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)\n\n# For one_cycle optim\nlr_div_factor: # Learning rate div factor\n\n# Specify optimizer\n# Valid values are driven by the Transformers OptimizerNames class, see:\n# https://github.com/huggingface/transformers/blob/cbf924b76c03828101a34069a96d209314114fd5/src/transformers/training_args.py#L144-L189\n#\n# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of\n# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. 
When in doubt, it is recommended to start with the optimizer used\n# in the examples/ for your model and fine-tuning use case.\n#\n# Valid values for 'optimizer' include:\n# - adamw_torch\n# - adamw_torch_fused (default)\n# - adamw_torch_xla\n# - adamw_torch_npu_fused\n# - adamw_apex_fused\n# - adopt_adamw (an EXPERIMENTAL optimizer, only for torch version >= 2.5.1)\n# - adafactor\n# - adamw_anyprecision\n# - adamw_torch_4bit\n# - ademamix\n# - sgd\n# - adagrad\n# - adamw_bnb_8bit\n# - adamw_8bit # alias for adamw_bnb_8bit\n# - ademamix_8bit\n# - lion_8bit\n# - lion_32bit\n# - paged_adamw_32bit\n# - paged_adamw_8bit\n# - paged_ademamix_32bit\n# - paged_ademamix_8bit\n# - paged_lion_32bit\n# - paged_lion_8bit\n# - rmsprop\n# - rmsprop_bnb\n# - rmsprop_bnb_8bit\n# - rmsprop_bnb_32bit\n# - galore_adamw\n# - galore_adamw_8bit\n# - galore_adafactor\n# - galore_adamw_layerwise\n# - galore_adamw_8bit_layerwise\n# - galore_adafactor_layerwise\n# - lomo\n# - adalomo\n# - grokadamw\n# - schedule_free_adamw\n# - schedule_free_sgd\n# - apollo_adamw\n# - apollo_adamw_layerwise\n#\n# Additional custom optimizers include:\n# - optimi_adamw\n# - ao_adamw_8bit\n# - ao_adamw_fp8\n# - came_pytorch\noptimizer:\n# Dictionary of arguments to pass to the optimizer\noptim_args:\n# For Galore Optimizers the following optim_args are available\n# rank: # type: int\n# update_proj_gap # type: int\n# scale # type: float\n# proj_type: # type: str, default = std\n\n# The target modules to optimize, i.e. 
the module names that you would like to train, right now this is used only for GaLore algorithm\noptim_target_modules:\n# - self_attn # for llama\n# - mlp\n\n# Specify weight decay\nweight_decay:\n# adamw hyperparams\nadam_beta1:\nadam_beta2:\nadam_beta3: # only used for CAME Optimizer\nadam_epsilon:\nadam_epsilon2: # only used for CAME Optimizer\n# Gradient clipping max norm\nmax_grad_norm:\n\n# Augmentation techniques\n# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings\n# currently only supported on Llama and Mistral\nneftune_noise_alpha:\n\n# Optional[bool]. Whether to bettertransformers\nflash_optimum:\n\n# Note: Only one of the following attention patches can be used at a time.\n# For example, if you set `xformers_attention` to `true`, do not set `flash_attention` to `true`.\n\n# Optional[bool]. Whether to use xformers attention patch https://github.com/facebookresearch/xformers:\nxformers_attention:\n# Optional[bool]. Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:\nflash_attention:\nflash_attn_cross_entropy: # Optional[bool]. Whether to use flash-attention cross entropy implementation - advanced use only\nflash_attn_rms_norm: # Optional[bool]. Whether to use flash-attention rms norm implementation - advanced use only\nflash_attn_fuse_qkv: # Optional[bool]. Whether to fuse QKV into a single operation\nflash_attn_fuse_mlp: # Optional[bool]. Whether to fuse part of the MLP into a single operation\n# Optional[bool]. Whether to use scaled-dot-product attention\n# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html\nsdp_attention:\n# Optional[bool]. Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf\ns2_attention:\n\n# Optional[bool]. Whether to use low_cpu_mem_usage\nlow_cpu_mem_usage:\n# Optional[str]. Resume from a specific checkpoint dir\nresume_from_checkpoint:\n# Optional[bool]. 
If resume_from_checkpoint isn't set and you simply want it to start where it left off.\n# Be careful with this being turned on between different models.\nauto_resume_from_checkpoints: false\n\n## Multimodal section\n# int | tuple[int, int] | None . Size to resize images to, width x height.\n# Will read from model/processor config if not set.\nimage_size:\n# str. Algorithm to use for image resizing. \"bilinear\", \"bicubic\", \"lanczos\". Default is \"bilinear\".\nimage_resize_algorithm: 'bilinear'\n## End of multimodal section\n\n# Don't mess with this, it's here for accelerate and torchrun\nlocal_rank:\n\n# Add or change special tokens.\n# If you add tokens here, you don't need to add them to the `tokens` list.\nspecial_tokens:\n # bos_token: \"<s>\"\n # eos_token: \"</s>\"\n # unk_token: \"<unk>\"\n # pad_token: \"[PAD]\"\n\n# Optional[list[str]]. Add extra tokens to the tokenizer.\ntokens:\n # - \"<|startoftext|>\"\n # - \"<|endoftext|>\"\n\n# Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer.\n# Only works for tokens that are not part of the base vocab (aka are added_tokens).\n# Can be checked if they exist in tokenizer.json added_tokens.\nadded_tokens_overrides: # Dict[int, str]\n# 128041: \"<|im_start|>\"\n# 128042: \"<|im_end|>\"\n\n# FSDP\nfsdp:\nfsdp_config:\n\n# Deepspeed config path. 
e.g., deepspeed_configs/zero3.json\ndeepspeed:\n\n# Advanced DDP Arguments\nddp_timeout:\nddp_bucket_cap_mb:\nddp_broadcast_buffers:\n\n# Sequence parallelism\n# Set to a divisor of the number of GPUs available to split sequences into chunks of equal size.\n# Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.\n# E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized\n# subsequences, or set to 4 to split into four equal-sized subsequences.\n# See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details.\nsequence_parallel_degree:\n# Optional; strides across the key dimension. Larger values use more memory but should make training faster.\n# Must evenly divide the number of KV heads in your model.\nheads_k_stride: 1\n# One of \"varlen_llama3\", \"batch_ring\", \"batch_zigzag\", \"batch_stripe\". Defaults to \"varlen_llama3\"\n# in the sample packing case, and \"batch_ring\" in the non-sample packing case.\nring_attn_func:\n\n# Path to torch distx for optim 'adamw_anyprecision'\ntorchdistx_path:\n\n# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize\npretraining_dataset:\n\n# Debug mode\ndebug:\n\n# Seed\nseed:\n\n# Allow overwrite yml config using from cli\nstrict:", "crumbs": [ "Getting Started", "Config Reference" @@ -3489,7 +3489,7 @@ "href": "docs/dataset-formats/index.html#pre-training", "title": "Dataset Formats", "section": "Pre-training", - "text": "Pre-training\nWhen aiming to train on large corpora of text datasets, pre-training is your go-to choice. Due to the size of these datasets, downloading the entire-datasets before beginning training would be prohibitively time-consuming. 
Axolotl supports streaming to only load batches into memory at a time.\nA sample format for a pre-training dataset is as follows:\n{\"text\": \"first row\"}\n{\"text\": \"second row\"}\n...\nIt is typically recommended to save your dataset as .jsonl due to its flexibility and simplicity.\nAxolotl supports loading from a Hugging Face hub repo or from local files.\n\n\n\n\n\n\nImportant\n\n\n\nFor pre-training only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts.\n\n\n\nPre-training from Hugging Face hub datasets\nAs an example, to train using a Hugging Face dataset hf_org/name, you can pass the following config:\npretraining_dataset: hf_org/name\n\n\nPre-training from local dataset files\nGiven a few corpus files: A.jsonl, B.jsonl, and C.jsonl, your config will look like the below:\npretraining_dataset:\n - path: json\n data_files:\n - A.jsonl\n - B.jsonl\n - C.jsonl\nWhile we recommend .jsonl, you can also use the other formats (csv, parquet, arrow, SQL, Webdataset) that are supported by Dataset.load_dataset\n\n\nPre-training without streaming\nOn the rare case that the dataset is small and can be loaded entirely into memory, another approach to running pre-training is to use the completion format. 
This would mean that the entire dataset is pre-tokenized instead of on-demand in streaming.\nOne benefit of this is that the tokenization can be performed separately on a CPU-only machine, and then transferred to a GPU machine for training to save costs.\nFrom Hugging Face:\ndatasets:\n - path: hf_org/name\n type: completion\nFrom local files (either example works):\ndatasets:\n - path: A.jsonl\n type: completion\n\n - path: json\n data_files: [\"A.jsonl\", \"B.jsonl\", \"C.jsonl\"]\n type: completion\n\n\nPre-training dataset configuration tips\n\nSetting max_steps\nWhen using streaming for large datasets, Axolotl does not know in advance how large the dataset is and does not know when to stop.\nTherefore, it is necessary to set max_steps: int in your config for pre-training to run, so that Axolotl knows when to stop training.\nOne step is equal to sequence_len * micro_batch_size * gradient_accumulation_steps * total_num_gpus tokens.\n\n\nGroup_by_length\nIt is recommended to leave this off if downloading from Hugging Face hub as it would download the entire dataset which can be very large.\n\n\n\nReference\nPlease see docs here.", + "text": "Pre-training\nWhen aiming to train on large corpora of text datasets, pre-training is your go-to choice. Due to the size of these datasets, downloading the entire-datasets before beginning training would be prohibitively time-consuming. 
Axolotl supports streaming to only load batches into memory at a time.\nA sample format for a pre-training dataset is as follows:\n{\"text\": \"first row\"}\n{\"text\": \"second row\"}\n...\nIt is typically recommended to save your dataset as .jsonl due to its flexibility and simplicity.\nAxolotl supports loading from a Hugging Face hub repo or from local files.\n\nPre-training from Hugging Face hub datasets\nAs an example, to train using a Hugging Face dataset hf_org/name, you can pass the following config:\npretraining_dataset: hf_org/name\n\n\nPre-training from local dataset files\nGiven a few corpus files: A.jsonl, B.jsonl, and C.jsonl, your config will look like the below:\npretraining_dataset:\n - path: json\n data_files:\n - A.jsonl\n - B.jsonl\n - C.jsonl\nWhile we recommend .jsonl, you can also use the other formats (csv, parquet, arrow, SQL, Webdataset) that are supported by Dataset.load_dataset\n\n\nPre-training without streaming\nOn the rare case that the dataset is small and can be loaded entirely into memory, another approach to running pre-training is to use the completion format. This would mean that the entire dataset is pre-tokenized instead of on-demand in streaming.\nOne benefit of this is that the tokenization can be performed separately on a CPU-only machine, and then transferred to a GPU machine for training to save costs.\nFrom Hugging Face:\ndatasets:\n - path: hf_org/name\n type: completion\nFrom local files:\ndatasets:\n - path: A.jsonl\n type: completion\n\n - path: B.jsonl\n type: completion\n\n\n\n\n\n\nImportant\n\n\n\nFor completion only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts. 
If you are interested in having this for pretraining_dataset too, please let us know or help make a PR!\n\n\n\n\nPre-training dataset configuration tips\n\nSetting max_steps\nWhen using streaming for large datasets, Axolotl does not know in advance how large the dataset is and does not know when to stop.\nTherefore, it is necessary to set max_steps: int in your config for pre-training to run, so that Axolotl knows when to stop training.\nOne step is equal to sequence_len * micro_batch_size * gradient_accumulation_steps * total_num_gpus tokens.\n\n\nGroup_by_length\nIt is recommended to leave this off if downloading from Hugging Face hub as it would download the entire dataset which can be very large.\n\n\n\nReference\nPlease see docs here.", "crumbs": [ "Dataset Formats" ] diff --git a/sitemap.xml b/sitemap.xml index e20f1c0a5..5ff5ca039 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -2,734 +2,734 @@ https://docs.axolotl.ai/TODO.html - 2025-05-27T15:45:42.249Z + 2025-05-28T08:51:31.373Z https://docs.axolotl.ai/docs/debugging.html - 2025-05-27T15:45:42.251Z + 2025-05-28T08:51:31.375Z https://docs.axolotl.ai/docs/rlhf.html - 2025-05-27T15:45:42.254Z + 2025-05-28T08:51:31.378Z https://docs.axolotl.ai/docs/input_output.html - 2025-05-27T15:45:42.254Z + 2025-05-28T08:51:31.377Z https://docs.axolotl.ai/docs/lr_groups.html - 2025-05-27T15:45:42.254Z + 2025-05-28T08:51:31.378Z https://docs.axolotl.ai/docs/dataset-formats/template_free.html - 2025-05-27T15:45:42.251Z + 2025-05-28T08:51:31.374Z https://docs.axolotl.ai/docs/dataset-formats/pretraining.html - 2025-05-27T15:45:42.251Z + 2025-05-28T08:51:31.374Z https://docs.axolotl.ai/docs/dataset-formats/conversation.html - 2025-05-27T15:45:42.251Z + 2025-05-28T08:51:31.374Z https://docs.axolotl.ai/docs/dataset-formats/inst_tune.html - 2025-05-27T15:45:42.251Z + 2025-05-28T08:51:31.374Z https://docs.axolotl.ai/docs/fsdp_qlora.html - 2025-05-27T15:45:42.251Z + 2025-05-28T08:51:31.375Z https://docs.axolotl.ai/docs/torchao.html - 
2025-05-27T15:45:42.255Z + 2025-05-28T08:51:31.378Z https://docs.axolotl.ai/docs/dataset_preprocessing.html - 2025-05-27T15:45:42.251Z + 2025-05-28T08:51:31.374Z https://docs.axolotl.ai/docs/batch_vs_grad.html - 2025-05-27T15:45:42.250Z + 2025-05-28T08:51:31.374Z https://docs.axolotl.ai/docs/dataset_loading.html - 2025-05-27T15:45:42.251Z + 2025-05-28T08:51:31.374Z https://docs.axolotl.ai/docs/amd_hpc.html - 2025-05-27T15:45:42.250Z + 2025-05-28T08:51:31.374Z https://docs.axolotl.ai/docs/docker.html - 2025-05-27T15:45:42.251Z + 2025-05-28T08:51:31.375Z https://docs.axolotl.ai/docs/multi-node.html - 2025-05-27T15:45:42.254Z + 2025-05-28T08:51:31.378Z https://docs.axolotl.ai/docs/sequence_parallelism.html - 2025-05-27T15:45:42.254Z + 2025-05-28T08:51:31.378Z https://docs.axolotl.ai/docs/multi-gpu.html - 2025-05-27T15:45:42.254Z + 2025-05-28T08:51:31.378Z https://docs.axolotl.ai/docs/api/utils.collators.core.html - 2025-05-27T15:46:13.694Z + 2025-05-28T08:51:59.475Z https://docs.axolotl.ai/docs/api/utils.collators.mm_chat.html - 2025-05-27T15:46:13.722Z + 2025-05-28T08:51:59.502Z https://docs.axolotl.ai/docs/api/integrations.kd.trainer.html - 2025-05-27T15:46:13.662Z + 2025-05-28T08:51:59.442Z https://docs.axolotl.ai/docs/api/prompt_strategies.kto.llama3.html - 2025-05-27T15:46:13.050Z + 2025-05-28T08:51:58.830Z https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_chat.html - 2025-05-27T15:46:12.943Z + 2025-05-28T08:51:58.722Z https://docs.axolotl.ai/docs/api/logging_config.html - 2025-05-27T15:46:12.445Z + 2025-05-28T08:51:58.221Z https://docs.axolotl.ai/docs/api/monkeypatch.mixtral.html - 2025-05-27T15:46:13.308Z + 2025-05-28T08:51:59.089Z https://docs.axolotl.ai/docs/api/utils.data.pretraining.html - 2025-05-27T15:46:13.448Z + 2025-05-28T08:51:59.230Z https://docs.axolotl.ai/docs/api/utils.schemas.model.html - 2025-05-27T15:46:13.465Z + 2025-05-28T08:51:59.246Z https://docs.axolotl.ai/docs/api/kernels.utils.html - 2025-05-27T15:46:13.194Z + 
2025-05-28T08:51:58.974Z https://docs.axolotl.ai/docs/api/prompt_strategies.base.html - 2025-05-27T15:46:12.911Z + 2025-05-28T08:51:58.690Z https://docs.axolotl.ai/docs/api/prompt_strategies.kto.chatml.html - 2025-05-27T15:46:13.058Z + 2025-05-28T08:51:58.838Z https://docs.axolotl.ai/docs/api/cli.sweeps.html - 2025-05-27T15:46:12.725Z + 2025-05-28T08:51:58.502Z https://docs.axolotl.ai/docs/api/monkeypatch.multipack.html - 2025-05-27T15:46:13.237Z + 2025-05-28T08:51:59.017Z https://docs.axolotl.ai/docs/api/cli.evaluate.html - 2025-05-27T15:46:12.635Z + 2025-05-28T08:51:58.412Z https://docs.axolotl.ai/docs/api/prompt_strategies.orcamini.html - 2025-05-27T15:46:13.004Z + 2025-05-28T08:51:58.784Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.passthrough.html - 2025-05-27T15:46:13.042Z + 2025-05-28T08:51:58.822Z https://docs.axolotl.ai/docs/api/monkeypatch.unsloth_.html - 2025-05-27T15:46:13.298Z + 2025-05-28T08:51:59.079Z https://docs.axolotl.ai/docs/api/utils.schemas.config.html - 2025-05-27T15:46:13.458Z + 2025-05-28T08:51:59.239Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.zephyr.html - 2025-05-27T15:46:13.039Z + 2025-05-28T08:51:58.819Z https://docs.axolotl.ai/docs/api/monkeypatch.attention.mllama.html - 2025-05-27T15:46:13.305Z + 2025-05-28T08:51:59.086Z https://docs.axolotl.ai/docs/api/core.chat.format.chatml.html - 2025-05-27T15:46:12.572Z + 2025-05-28T08:51:58.349Z https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_instruct.html - 2025-05-27T15:46:12.945Z + 2025-05-28T08:51:58.724Z https://docs.axolotl.ai/docs/api/common.datasets.html - 2025-05-27T15:46:13.692Z + 2025-05-28T08:51:59.472Z https://docs.axolotl.ai/docs/api/prompt_strategies.stepwise_supervised.html - 2025-05-27T15:46:12.994Z + 2025-05-28T08:51:58.773Z https://docs.axolotl.ai/docs/api/monkeypatch.trainer_fsdp_optim.html - 2025-05-27T15:46:13.290Z + 2025-05-28T08:51:59.071Z https://docs.axolotl.ai/docs/api/integrations.cut_cross_entropy.args.html - 2025-05-27T15:46:13.653Z 
+ 2025-05-28T08:51:59.434Z https://docs.axolotl.ai/docs/api/utils.chat_templates.html - 2025-05-27T15:46:13.354Z + 2025-05-28T08:51:59.134Z https://docs.axolotl.ai/docs/api/monkeypatch.btlm_attn_hijack_flash.html - 2025-05-27T15:46:13.280Z + 2025-05-28T08:51:59.061Z https://docs.axolotl.ai/docs/api/utils.lora.html - 2025-05-27T15:46:13.358Z + 2025-05-28T08:51:59.139Z https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_disk.html - 2025-05-27T15:46:13.337Z + 2025-05-28T08:51:59.118Z https://docs.axolotl.ai/docs/api/core.trainers.dpo.trainer.html - 2025-05-27T15:46:12.817Z + 2025-05-28T08:51:58.595Z https://docs.axolotl.ai/docs/api/cli.inference.html - 2025-05-27T15:46:12.690Z + 2025-05-28T08:51:58.468Z https://docs.axolotl.ai/docs/api/prompt_strategies.input_output.html - 2025-05-27T15:46:12.989Z + 2025-05-28T08:51:58.769Z https://docs.axolotl.ai/docs/api/utils.bench.html - 2025-05-27T15:46:13.367Z + 2025-05-28T08:51:59.148Z https://docs.axolotl.ai/docs/api/loaders.model.html - 2025-05-27T15:46:12.849Z + 2025-05-28T08:51:58.628Z https://docs.axolotl.ai/docs/api/core.trainers.relora.html - 2025-05-27T15:46:12.811Z + 2025-05-28T08:51:58.588Z https://docs.axolotl.ai/docs/api/integrations.spectrum.args.html - 2025-05-27T15:46:13.672Z + 2025-05-28T08:51:59.452Z https://docs.axolotl.ai/docs/api/core.trainers.grpo.trainer.html - 2025-05-27T15:46:12.826Z + 2025-05-28T08:51:58.604Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_expand_mask.html - 2025-05-27T15:46:13.246Z + 2025-05-28T08:51:59.026Z https://docs.axolotl.ai/docs/api/utils.schemas.peft.html - 2025-05-27T15:46:13.496Z + 2025-05-28T08:51:59.277Z https://docs.axolotl.ai/docs/api/loaders.patch_manager.html - 2025-05-27T15:46:12.873Z + 2025-05-28T08:51:58.651Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_patch_multipack.html - 2025-05-27T15:46:13.281Z + 2025-05-28T08:51:59.063Z https://docs.axolotl.ai/docs/api/prompt_strategies.messages.chat.html - 2025-05-27T15:46:13.016Z + 
2025-05-28T08:51:58.795Z https://docs.axolotl.ai/docs/api/models.mamba.modeling_mamba.html - 2025-05-27T15:46:13.693Z + 2025-05-28T08:51:59.473Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chat_template.html - 2025-05-27T15:46:13.017Z + 2025-05-28T08:51:58.797Z https://docs.axolotl.ai/docs/api/monkeypatch.stablelm_attn_hijack_flash.html - 2025-05-27T15:46:13.287Z + 2025-05-28T08:51:59.068Z https://docs.axolotl.ai/docs/api/utils.schemas.enums.html - 2025-05-27T15:46:13.525Z + 2025-05-28T08:51:59.306Z https://docs.axolotl.ai/docs/api/loaders.constants.html - 2025-05-27T15:46:12.874Z + 2025-05-28T08:51:58.652Z https://docs.axolotl.ai/docs/api/prompt_strategies.metharme.html - 2025-05-27T15:46:13.001Z + 2025-05-28T08:51:58.780Z https://docs.axolotl.ai/docs/api/kernels.swiglu.html - 2025-05-27T15:46:13.186Z + 2025-05-28T08:51:58.965Z https://docs.axolotl.ai/docs/api/core.trainers.base.html - 2025-05-27T15:46:12.784Z + 2025-05-28T08:51:58.562Z https://docs.axolotl.ai/docs/api/monkeypatch.relora.html - 2025-05-27T15:46:13.244Z + 2025-05-28T08:51:59.025Z https://docs.axolotl.ai/docs/api/cli.cloud.modal_.html - 2025-05-27T15:46:12.770Z + 2025-05-28T08:51:58.548Z https://docs.axolotl.ai/docs/api/core.training_args.html - 2025-05-27T15:46:12.547Z + 2025-05-28T08:51:58.324Z https://docs.axolotl.ai/docs/api/utils.callbacks.comet_.html - 2025-05-27T15:46:13.781Z + 2025-05-28T08:51:59.561Z https://docs.axolotl.ai/docs/api/core.chat.format.shared.html - 2025-05-27T15:46:12.575Z + 2025-05-28T08:51:58.352Z https://docs.axolotl.ai/docs/api/monkeypatch.lora_kernels.html - 2025-05-27T15:46:13.271Z + 2025-05-28T08:51:59.052Z https://docs.axolotl.ai/docs/api/cli.merge_lora.html - 2025-05-27T15:46:12.699Z + 2025-05-28T08:51:58.476Z https://docs.axolotl.ai/docs/api/utils.trainer.html - 2025-05-27T15:46:13.392Z + 2025-05-28T08:51:59.173Z https://docs.axolotl.ai/docs/api/utils.dict.html - 2025-05-27T15:46:13.439Z + 2025-05-28T08:51:59.221Z 
https://docs.axolotl.ai/docs/api/kernels.quantize.html - 2025-05-27T15:46:13.193Z + 2025-05-28T08:51:58.973Z https://docs.axolotl.ai/docs/api/core.trainers.utils.html - 2025-05-27T15:46:12.840Z + 2025-05-28T08:51:58.618Z https://docs.axolotl.ai/docs/api/monkeypatch.data.batch_dataset_fetcher.html - 2025-05-27T15:46:13.307Z + 2025-05-28T08:51:59.088Z https://docs.axolotl.ai/docs/api/utils.ctx_managers.sequence_parallel.html - 2025-05-27T15:46:12.910Z + 2025-05-28T08:51:58.689Z https://docs.axolotl.ai/docs/api/core.trainers.grpo.sampler.html - 2025-05-27T15:46:12.838Z + 2025-05-28T08:51:58.616Z https://docs.axolotl.ai/docs/api/prompt_strategies.user_defined.html - 2025-05-27T15:46:12.965Z + 2025-05-28T08:51:58.744Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_flash.html - 2025-05-27T15:46:13.220Z + 2025-05-28T08:51:59.000Z https://docs.axolotl.ai/docs/api/cli.train.html - 2025-05-27T15:46:12.627Z + 2025-05-28T08:51:58.404Z https://docs.axolotl.ai/docs/api/datasets.html - 2025-05-27T15:46:12.381Z + 2025-05-28T08:51:58.161Z https://docs.axolotl.ai/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html - 2025-05-27T15:45:42.271Z + 2025-05-28T08:51:31.395Z https://docs.axolotl.ai/index.html - 2025-05-27T15:45:42.267Z + 2025-05-28T08:51:31.391Z https://docs.axolotl.ai/examples/colab-notebooks/colab-axolotl-example.html - 2025-05-27T15:45:42.255Z + 2025-05-28T08:51:31.379Z https://docs.axolotl.ai/FAQS.html - 2025-05-27T15:45:42.249Z + 2025-05-28T08:51:31.372Z https://docs.axolotl.ai/src/axolotl/integrations/LICENSE.html - 2025-05-27T15:45:42.271Z + 2025-05-28T08:51:31.395Z https://docs.axolotl.ai/docs/api/cli.utils.html - 2025-05-27T15:46:12.756Z + 2025-05-28T08:51:58.533Z https://docs.axolotl.ai/docs/api/core.trainers.mixins.optimizer.html - 2025-05-27T15:46:12.877Z + 2025-05-28T08:51:58.656Z https://docs.axolotl.ai/docs/api/prompt_strategies.orpo.chat_template.html - 2025-05-27T15:46:13.079Z + 2025-05-28T08:51:58.859Z 
https://docs.axolotl.ai/docs/api/prompt_strategies.completion.html - 2025-05-27T15:46:12.983Z + 2025-05-28T08:51:58.763Z https://docs.axolotl.ai/docs/api/evaluate.html - 2025-05-27T15:46:12.374Z + 2025-05-28T08:51:58.153Z https://docs.axolotl.ai/docs/api/loaders.processor.html - 2025-05-27T15:46:12.859Z + 2025-05-28T08:51:58.637Z https://docs.axolotl.ai/docs/api/common.const.html - 2025-05-27T15:46:13.675Z + 2025-05-28T08:51:59.455Z https://docs.axolotl.ai/docs/api/prompt_strategies.bradley_terry.llama3.html - 2025-05-27T15:46:13.083Z + 2025-05-28T08:51:58.863Z https://docs.axolotl.ai/docs/api/utils.distributed.html - 2025-05-27T15:46:13.436Z + 2025-05-28T08:51:59.217Z https://docs.axolotl.ai/docs/api/utils.tokenization.html - 2025-05-27T15:46:13.344Z + 2025-05-28T08:51:59.125Z https://docs.axolotl.ai/docs/api/utils.schemas.integrations.html - 2025-05-27T15:46:13.517Z + 2025-05-28T08:51:59.298Z https://docs.axolotl.ai/docs/api/utils.schedulers.html - 2025-05-27T15:46:13.416Z + 2025-05-28T08:51:59.198Z https://docs.axolotl.ai/docs/api/core.trainers.mixins.scheduler.html - 2025-05-27T15:46:12.887Z + 2025-05-28T08:51:58.666Z https://docs.axolotl.ai/docs/api/core.datasets.chat.html - 2025-05-27T15:46:12.580Z + 2025-05-28T08:51:58.357Z https://docs.axolotl.ai/docs/api/utils.callbacks.mlflow_.html - 2025-05-27T15:46:13.777Z + 2025-05-28T08:51:59.558Z https://docs.axolotl.ai/docs/api/monkeypatch.mistral_attn_hijack_flash.html - 2025-05-27T15:46:13.236Z + 2025-05-28T08:51:59.016Z https://docs.axolotl.ai/docs/api/core.chat.format.llama3x.html - 2025-05-27T15:46:12.573Z + 2025-05-28T08:51:58.350Z https://docs.axolotl.ai/docs/api/cli.checks.html - 2025-05-27T15:46:12.659Z + 2025-05-28T08:51:58.436Z https://docs.axolotl.ai/docs/api/monkeypatch.transformers_fa_utils.html - 2025-05-27T15:46:13.297Z + 2025-05-28T08:51:59.078Z https://docs.axolotl.ai/docs/api/prompt_strategies.llama2_chat.html - 2025-05-27T15:46:12.977Z + 2025-05-28T08:51:58.757Z 
https://docs.axolotl.ai/docs/api/convert.html - 2025-05-27T15:46:12.395Z + 2025-05-28T08:51:58.174Z https://docs.axolotl.ai/docs/api/utils.schemas.datasets.html - 2025-05-27T15:46:13.487Z + 2025-05-28T08:51:59.269Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.llama3.html - 2025-05-27T15:46:13.027Z + 2025-05-28T08:51:58.807Z https://docs.axolotl.ai/docs/api/cli.args.html - 2025-05-27T15:46:12.652Z + 2025-05-28T08:51:58.429Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_xformers.html - 2025-05-27T15:46:13.221Z + 2025-05-28T08:51:59.002Z https://docs.axolotl.ai/docs/api/train.html - 2025-05-27T15:46:12.363Z + 2025-05-28T08:51:58.143Z https://docs.axolotl.ai/docs/api/core.trainer_builder.html - 2025-05-27T15:46:12.460Z + 2025-05-28T08:51:58.236Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chatml.html - 2025-05-27T15:46:13.037Z + 2025-05-28T08:51:58.817Z https://docs.axolotl.ai/docs/api/cli.main.html - 2025-05-27T15:46:12.619Z + 2025-05-28T08:51:58.396Z https://docs.axolotl.ai/docs/api/index.html - 2025-05-27T15:46:12.303Z + 2025-05-28T08:51:58.082Z https://docs.axolotl.ai/docs/api/cli.preprocess.html - 2025-05-27T15:46:12.719Z + 2025-05-28T08:51:58.496Z https://docs.axolotl.ai/docs/api/utils.freeze.html - 2025-05-27T15:46:13.375Z + 2025-05-28T08:51:59.156Z https://docs.axolotl.ai/docs/api/utils.data.sft.html - 2025-05-27T15:46:13.450Z + 2025-05-28T08:51:59.231Z https://docs.axolotl.ai/docs/api/integrations.liger.args.html - 2025-05-27T15:46:13.665Z + 2025-05-28T08:51:59.446Z https://docs.axolotl.ai/docs/api/loaders.adapter.html - 2025-05-27T15:46:12.864Z + 2025-05-28T08:51:58.643Z https://docs.axolotl.ai/docs/api/core.trainers.mixins.rng_state_loader.html - 2025-05-27T15:46:12.881Z + 2025-05-28T08:51:58.659Z https://docs.axolotl.ai/docs/api/prompt_strategies.kto.user_defined.html - 2025-05-27T15:46:13.059Z + 2025-05-28T08:51:58.839Z https://docs.axolotl.ai/docs/api/monkeypatch.utils.html - 2025-05-27T15:46:13.278Z + 
2025-05-28T08:51:59.060Z https://docs.axolotl.ai/docs/api/utils.schemas.multimodal.html - 2025-05-27T15:46:13.505Z + 2025-05-28T08:51:59.286Z https://docs.axolotl.ai/docs/api/prompt_strategies.chat_template.html - 2025-05-27T15:46:12.929Z + 2025-05-28T08:51:58.708Z https://docs.axolotl.ai/docs/api/integrations.grokfast.optimizer.html - 2025-05-27T15:46:13.654Z + 2025-05-28T08:51:59.435Z https://docs.axolotl.ai/docs/api/core.trainers.mamba.html - 2025-05-27T15:46:12.806Z + 2025-05-28T08:51:58.584Z https://docs.axolotl.ai/docs/api/utils.schemas.trl.html - 2025-05-27T15:46:13.499Z + 2025-05-28T08:51:59.280Z https://docs.axolotl.ai/docs/api/cli.config.html - 2025-05-27T15:46:12.676Z + 2025-05-28T08:51:58.453Z https://docs.axolotl.ai/docs/api/cli.vllm_serve.html - 2025-05-27T15:46:12.761Z + 2025-05-28T08:51:58.538Z https://docs.axolotl.ai/docs/api/core.datasets.transforms.chat_builder.html - 2025-05-27T15:46:12.588Z + 2025-05-28T08:51:58.365Z https://docs.axolotl.ai/docs/api/loaders.tokenizer.html - 2025-05-27T15:46:12.857Z + 2025-05-28T08:51:58.636Z https://docs.axolotl.ai/docs/api/utils.callbacks.profiler.html - 2025-05-27T15:46:13.772Z + 2025-05-28T08:51:59.552Z https://docs.axolotl.ai/docs/api/cli.merge_sharded_fsdp_weights.html - 2025-05-27T15:46:12.711Z + 2025-05-28T08:51:58.488Z https://docs.axolotl.ai/docs/api/utils.collators.mamba.html - 2025-05-27T15:46:13.717Z + 2025-05-28T08:51:59.498Z https://docs.axolotl.ai/docs/api/utils.samplers.multipack.html - 2025-05-27T15:46:13.762Z + 2025-05-28T08:51:59.542Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.user_defined.html - 2025-05-27T15:46:13.040Z + 2025-05-28T08:51:58.820Z https://docs.axolotl.ai/docs/api/utils.schemas.training.html - 2025-05-27T15:46:13.470Z + 2025-05-28T08:51:59.251Z https://docs.axolotl.ai/docs/api/utils.callbacks.perplexity.html - 2025-05-27T15:46:13.769Z + 2025-05-28T08:51:59.549Z https://docs.axolotl.ai/docs/api/kernels.lora.html - 2025-05-27T15:46:13.165Z + 2025-05-28T08:51:58.945Z 
https://docs.axolotl.ai/docs/api/core.chat.messages.html - 2025-05-27T15:46:12.570Z + 2025-05-28T08:51:58.347Z https://docs.axolotl.ai/docs/api/integrations.base.html - 2025-05-27T15:46:13.650Z + 2025-05-28T08:51:59.431Z https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_w_system.html - 2025-05-27T15:46:12.957Z + 2025-05-28T08:51:58.736Z https://docs.axolotl.ai/docs/api/utils.collators.batching.html - 2025-05-27T15:46:13.713Z + 2025-05-28T08:51:59.494Z https://docs.axolotl.ai/docs/api/core.trainers.trl.html - 2025-05-27T15:46:12.802Z + 2025-05-28T08:51:58.579Z https://docs.axolotl.ai/docs/api/utils.schemas.utils.html - 2025-05-27T15:46:13.531Z + 2025-05-28T08:51:59.312Z https://docs.axolotl.ai/docs/api/utils.model_shard_quant.html - 2025-05-27T15:46:13.364Z + 2025-05-28T08:51:59.145Z https://docs.axolotl.ai/docs/api/integrations.lm_eval.args.html - 2025-05-27T15:46:13.668Z + 2025-05-28T08:51:59.449Z https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html - 2025-05-27T15:46:13.311Z + 2025-05-28T08:51:59.092Z https://docs.axolotl.ai/docs/api/utils.callbacks.lisa.html - 2025-05-27T15:46:13.774Z + 2025-05-28T08:51:59.554Z https://docs.axolotl.ai/docs/api/utils.optimizers.adopt.html - 2025-05-27T15:46:13.447Z + 2025-05-28T08:51:59.228Z https://docs.axolotl.ai/docs/api/common.architectures.html - 2025-05-27T15:46:13.673Z + 2025-05-28T08:51:59.454Z https://docs.axolotl.ai/docs/api/cli.cloud.base.html - 2025-05-27T15:46:12.764Z + 2025-05-28T08:51:58.541Z https://docs.axolotl.ai/docs/api/prompt_strategies.pygmalion.html - 2025-05-27T15:46:13.011Z + 2025-05-28T08:51:58.791Z https://docs.axolotl.ai/docs/api/prompt_tokenizers.html - 2025-05-27T15:46:12.439Z + 2025-05-28T08:51:58.216Z https://docs.axolotl.ai/docs/api/kernels.geglu.html - 2025-05-27T15:46:13.175Z + 2025-05-28T08:51:58.955Z https://docs.axolotl.ai/docs/custom_integrations.html - 2025-05-27T15:45:42.251Z + 2025-05-28T08:51:31.374Z https://docs.axolotl.ai/docs/multimodal.html - 
2025-05-27T15:45:42.254Z + 2025-05-28T08:51:31.378Z https://docs.axolotl.ai/docs/faq.html - 2025-05-27T15:45:42.251Z + 2025-05-28T08:51:31.375Z https://docs.axolotl.ai/docs/multipack.html - 2025-05-27T15:45:42.254Z + 2025-05-28T08:51:31.378Z https://docs.axolotl.ai/docs/lora_optims.html - 2025-05-27T15:45:42.254Z + 2025-05-28T08:51:31.377Z https://docs.axolotl.ai/docs/nccl.html - 2025-05-27T15:45:42.254Z + 2025-05-28T08:51:31.378Z https://docs.axolotl.ai/docs/cli.html - 2025-05-27T15:45:42.250Z + 2025-05-28T08:51:31.374Z https://docs.axolotl.ai/docs/config.html - 2025-05-27T15:45:42.250Z + 2025-05-28T08:51:31.374Z https://docs.axolotl.ai/docs/unsloth.html - 2025-05-27T15:45:42.255Z + 2025-05-28T08:51:31.378Z https://docs.axolotl.ai/docs/ray-integration.html - 2025-05-27T15:45:42.254Z + 2025-05-28T08:51:31.378Z https://docs.axolotl.ai/docs/dataset-formats/stepwise_supervised.html - 2025-05-27T15:45:42.251Z + 2025-05-28T08:51:31.374Z https://docs.axolotl.ai/docs/dataset-formats/index.html - 2025-05-27T15:45:42.251Z + 2025-05-28T08:51:31.374Z https://docs.axolotl.ai/docs/dataset-formats/tokenized.html - 2025-05-27T15:45:42.251Z + 2025-05-28T08:51:31.374Z https://docs.axolotl.ai/docs/installation.html - 2025-05-27T15:45:42.254Z + 2025-05-28T08:51:31.377Z https://docs.axolotl.ai/docs/inference.html - 2025-05-27T15:45:42.254Z + 2025-05-28T08:51:31.377Z https://docs.axolotl.ai/docs/mac.html - 2025-05-27T15:45:42.254Z + 2025-05-28T08:51:31.378Z https://docs.axolotl.ai/docs/getting-started.html - 2025-05-27T15:45:42.251Z + 2025-05-28T08:51:31.375Z https://docs.axolotl.ai/docs/reward_modelling.html - 2025-05-27T15:45:42.254Z + 2025-05-28T08:51:31.378Z diff --git a/src/axolotl/integrations/LICENSE.html b/src/axolotl/integrations/LICENSE.html index 330c7f0c6..dfbbed041 100644 --- a/src/axolotl/integrations/LICENSE.html +++ b/src/axolotl/integrations/LICENSE.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + + diff --git 
a/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html b/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html index a6e75d697..53084c250 100644 --- a/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html +++ b/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html @@ -67,6 +67,15 @@ ul.task-list li input[type="checkbox"] { "search-label": "Search" } } + + +